diff --git a/CMakeLists.txt b/CMakeLists.txt index 265ddc9504167f21f54a1b1e7777147b3b6d37d9..fb796103350ac4403d4151cf08eb4315bcde68fd 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,10 @@ include(generic) # simplify cmake module # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) - +option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF) +if (WITH_GPU AND WITH_XPU) + message(FATAL_ERROR "Error when compile GPU and XPU at the same time") +endif() # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. " diff --git a/Dockerfile b/Dockerfile index 42a103240e882b2732f14619308cc00f010d20af..b92ac228a8d50da93f8c0bfe2f6af31fc784f2c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # version util jupyter fixes this issue. -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. + RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ @@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort RUN pip3.7 --no-cache-dir install pylint pytest astroid isort RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker -RUN pip3 --no-cache-dir install coverage -RUN pip3.6 --no-cache-dir install coverage -RUN pip3.7 --no-cache-dir install coverage +RUN pip3 --no-cache-dir install coverage +RUN pip3.6 --no-cache-dir install coverage +RUN pip3.7 --no-cache-dir install coverage RUN pip --no-cache-dir install coverage COPY ./python/requirements.txt /root/ @@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . 
\ - make) # ar mishandles 4GB files # https://sourceware.org/bugzilla/show_bug.cgi?id=14625 diff --git a/cmake/configure.cmake b/cmake/configure.cmake index bb57b42dcc74114312a400a0f6cc95df307de6bb..cf458d97706755e794c5fbb1ba9d3fcb51e9d1ce 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -63,6 +63,11 @@ if(WITH_BOX_PS) add_definitions(-DPADDLE_WITH_BOX_PS) endif() +if(WITH_XPU) + message(STATUS "Compile with XPU!") + add_definitions(-DPADDLE_WITH_XPU) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index bb92eae732e1ebe98dc05890d90bf8b53d8b41a9..b7a93cd9ee2160090c0142d62d96da72e4c58717 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -61,6 +61,10 @@ function(detect_installed_gpus out_variable) if(NOT CUDA_gpu_detect_output) message(STATUS "Automatic GPU detection failed. Building for all known architectures.") set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) + #Todo: fix Automatic GPU detection failed on windows + if(WIN32) + set(${out_variable} "61 75" PARENT_SCOPE) + endif() else() set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) endif() diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index 4a343b2c6af2ce64d65203ae5955b2d552055198..6f790f1af8e1a03d1101244a4d82045331b44c13 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -17,7 +17,7 @@ include(ExternalProject) set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub) set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub) set(CUB_REPOSITORY https://github.com/NVlabs/cub.git) -set(CUB_TAG 1.9.8) +set(CUB_TAG 1.8.0) cache_third_party(extern_cub REPOSITORY ${CUB_REPOSITORY} diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index b541d73bc6a633d8e6a77ff567d756f3b40bfce9..8a655b2954dea5d6b864616ed2f4d19b167c4be8 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite) if(NOT LITE_GIT_TAG) - set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa) + set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa) endif() if(NOT CUDA_ARCH_NAME) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8a927d8e282a03e8a74c0814ee8d9b247451a091 --- /dev/null +++ b/cmake/external/xpu.cmake @@ -0,0 +1,54 @@ +if (NOT WITH_XPU) + return() +endif() + +INCLUDE(ExternalProject) +SET(XPU_PROJECT "extern_xpu") +SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE) +SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") +SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") +SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") +SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include") +SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include") +SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") + +SET(XPU_API_LIB_NAME "libxpuapi.so") +SET(XPU_RT_LIB_NAME "libxpurt.so") +SET(XPU_SIM_LIB_NAME "libxpusim.so") +SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") +SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") +SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}") + +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") + +INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) +INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR}) + +FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(XPU)\n" + 
"cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY xpu/api xpu/runtime xpu/lib \n" + " DESTINATION ${XPU_INSTALL_DIR})\n") + +ExternalProject_Add( + ${XPU_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${XPU_SOURCE_DIR} + DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz + && tar xvf xpu.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} +) + +ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL) +set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") + +# generate a static dummy target to track xpulib dependencies +# for cc_library(xxx SRCS xxx.c DEPS xpulib) +generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") + +TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB}) +ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 8842e8e21c6df224bb6341a4f7f526e3d61e92e1..1956e5c39ea2524d8a8e2650eb08f8d58f410b73 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -384,8 +384,12 @@ function(cc_test_run TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - # No unit test should exceed 10 minutes. - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + # No unit test should exceed 2 minutes. + if (APPLE OR WIN32) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + else() + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120) + endif() endif() endfunction() @@ -742,9 +746,14 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() + + if (APPLE OR WIN32) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + else() + # No unit test should exceed 2 minutes in Linux. + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120) + endif() - # No unit test should exceed 10 minutes. 
- set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) endif() endfunction() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 5a889dbc3143833ff48a972d17efc0aaf63f1810..20f27715e00457a8fe43f5c620e2a005387d7988 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + if (WITH_CRYPTO) set(dst_dir "${DST}/third_party/install/cryptopp") copy(${TARGET} - SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) + SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + endif() set(dst_dir "${DST}/third_party/install/xxhash") copy(${TARGET} @@ -187,7 +189,7 @@ copy(inference_lib_dist SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal) copy(inference_lib_dist - SRCS ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index e927fae63f0fc28902431c7b09350c7f7d10c52a..f60a6dc3f0c89dd345b04ea3a1e213de770e5760 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -8,6 +8,7 @@ function(op_library TARGET) set(hip_cu_srcs) set(miopen_hip_cc_srcs) set(cu_cc_srcs) + set(xpu_cc_srcs) set(cudnn_cu_cc_srcs) set(cudnn_cu_srcs) set(CUDNN_FILE) @@ -60,6 +61,12 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) endif() endif() + if(WITH_XPU) + string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc) + list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if (${src} MATCHES ".*\\.hip.cu$") @@ -76,6 +83,8 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) + elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$") + list(APPEND xpu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -109,7 +118,7 @@ function(op_library TARGET) hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() @@ -150,10 +159,11 @@ function(op_library TARGET) list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) + list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH hip_cu_srcs hip_cu_srcs_len) list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) + ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -179,6 +189,9 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") endif() + if (WITH_XPU 
AND ${xpu_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") + endif() # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator @@ -228,6 +241,7 @@ function(register_operators) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" OPS "${OPS}") + string(REPLACE "_xpu" "" OPS "${OPS}") string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 5b2c0f51cd745ff8b39e7757a9832955ec6aeb5a..c9442e8f843ac152cac02908799a8d24f5951e58 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -250,6 +250,11 @@ if(WITH_GPU) file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage endif(WITH_GPU) +if(WITH_XPU) + include(external/xpu) # download, build, install xpu + list(APPEND third_party_deps extern_xpu) +endif(WITH_XPU) + if(WITH_PSLIB) include(external/pslib) # download, build, install pslib list(APPEND third_party_deps extern_pslib) diff --git a/go/paddle/config.go b/go/paddle/config.go index cea69e716bffada9e5565eacf8ac1af84ae5b930..c4f39fa9c5d627a689c064bbbd2178cd1ae1a929 100644 --- a/go/paddle/config.go +++ b/go/paddle/config.go @@ -154,10 +154,17 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() { C.PD_EnableMkldnnQuantizer(config.c) } +func (config *AnalysisConfig) EnableMkldnnBfloat16() { + C.PD_EnableMkldnnBfloat16(config.c) +} + func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool { return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c)) } +func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool { + return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c)) +} // SetModelBuffer // ModelFromMemory diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8c49a5f6de7706eadd6afc1e6e55701dbfe9304d..9d5c0cc7048f7db539c090d28c6184ac6d72d75a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -119,9 +119,13 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) -cc_library(attribute SRCS attribute.cc DEPS framework_proto boost) +cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc device_context) + +cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost) +cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry) + cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context) diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 0bd6a79b55392e8bfb8f33b0a29b4cf1df0d44dc..5574a55e18c6d9806cb878dc69ec597f81da97d8 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item, 
TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { - dst_item->ShareDataWith(src_item); + TensorCopy(src_item, platform::CPUPlace(), dst_item); } } else { dst_item->clear(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 4b984210ed18d9f51d9485616d1c28871d936237..551d1342edeb335d1cad4782f85ae9f94f8739bd 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -113,7 +113,9 @@ message DistributedStrategy { optional bool fuse_all_reduce_ops = 18 [ default = true ]; optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ]; optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ]; - // optional bool enable_backward_optimizer_op_deps = 19 [ default = true ]; + optional bool cudnn_exhaustive_search = 21 [ default = true ]; + optional int32 conv_workspace_size_limit = 22 [ default = 4000 ]; + optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index f2421248e33f236b9fa861f22ce4848531cf1791..180b33d0cb72e2c4c9e6e8caff9f0ef5f1b04689 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { return ctx; } + inline ::DLContext operator()(const platform::XPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::XPUPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #ifdef PADDLE_WITH_CUDA ::DLContext ctx; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8e2e1d38a66d1039519bab312f77bef6604d8ec1..f11edb9a41bdcbcb33efc600f1d7d8f70fb76f45 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (!ctx->force_disable_gc_ && max_memory_size >= 0) { -#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { +#ifdef PADDLE_WITH_CUDA if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); @@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, gc.reset(new DefaultStreamGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); } - } else if (platform::is_cpu_place(place_)) { +#else + PADDLE_THROW( + platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle")); #endif + } else if (platform::is_cpu_place(place_)) { gc.reset(new CPUGarbageCollector( BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size)); -#ifdef PADDLE_WITH_CUDA - } + } else if (platform::is_xpu_place(place_)) { +#ifdef PADDLE_WITH_XPU + gc.reset(new XPUGarbageCollector( + BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size)); +#else + PADDLE_THROW( + platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); #endif + } } for (int64_t i = start_op_index; i < end_op_index; ++i) { diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index 
3f932ee226ca85b86c92fef1b30420d782d9bc62..758cde78530d7b334a7100bce6ce32c2869cc066 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -105,6 +105,11 @@ enum GlooStoreType { HDFS, HTTP }; class GlooWrapper { public: + static std::shared_ptr GetInstance() { + static auto s_instance = std::make_shared(); + return s_instance; + } + GlooWrapper() {} virtual ~GlooWrapper() {} @@ -153,6 +158,11 @@ class GlooWrapper { #endif } + bool IsInitialized() { return is_initialized_; } +#ifdef PADDLE_WITH_GLOO + std::shared_ptr GetContext() { return context_; } +#endif + template std::vector AllReduce(std::vector& sendbuf, // NOLINT const std::string& mode = "sum") { // NOLINT diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 84b5502ff7b369452e7c9988d185450934c78b03..29312370b3448bfe3c04b914ce0748eb1a66cf32 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -115,6 +115,7 @@ message VarType { SIZE_T = 19; UINT8 = 20; INT8 = 21; + BF16 = 22; // Other types that may need additional descriptions LOD_TENSOR = 7; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index ac892443de36cf6d37d56da761fb3d60628a5e4a..f69ada080676cddfa4f31c6cbc450b8eca28b3ac 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function &callback) { callback(); } +#ifdef PADDLE_WITH_XPU +XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} +void XPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} +#endif + #ifdef PADDLE_WITH_CUDA UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 2212122c03de3416c91fcc46bf510bbc02d4302e..4f7739652822b9047b1798b6bd66261effbe2f49 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector { void ClearCallback(const std::function &callback) override; }; +#ifdef PADDLE_WITH_XPU +class XPUGarbageCollector : public GarbageCollector { + public: + XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + #ifdef PADDLE_WITH_CUDA class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index d00e38784c2c0415a59a33fc24d708c253481c21..9bde9e20b19a0b14ce4489b91d9ab3d5273f7f9a 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -12,67 +12,122 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/generator.h" + +#include + #include #include #include #include #include -#include "paddle/fluid/framework/generator.h" - namespace paddle { namespace framework { -std::shared_ptr Generator::gen_instance_ = NULL; +const std::shared_ptr& DefaultCPUGenerator() { + static auto default_cpu_generator = + std::make_shared(GetRandomSeed()); + VLOG(4) << "initial seed: " << default_cpu_generator->GetCurrentSeed() + << ", cpu engine: " << default_cpu_generator->GetCPUEngine().get(); + return default_cpu_generator; +} + +std::shared_ptr OpDefaultCPUEngine() { + static auto op_default_cpu_engine = std::make_shared(); + return op_default_cpu_engine; +} + +// NOTE(zhiqiu): there are 3 conditions: +// (1) op seed is not set and DefaultCPUGenerator is inited, use +// DefaultCPUGenerator +// (2) op seed is not set and DefaultCPUGenerator is not inited, use se +// OpDefaultCPUEngine() and set a radnom seed +// (3) op seed is set, use OpDefaultCPUEngine() and set the seed +std::shared_ptr GetCPURandomEngine(uint64_t seed) { + if (DefaultCPUGenerator()->GetIsInitPy() && seed == 0) { + VLOG(4) << "Use random engine from generator"; + return DefaultCPUGenerator()->GetCPUEngine(); + } else { + // NOTE(zhiqiu): creating an engine instance everytime instead of using + // OpDefaultCPUEngine(), this is the legacy behavior of random operators. + // The benefit is that when runing PE with fixed-seed in multiple thrads, + // each thread has their own engine, and doesn't affect each other. + // + // And we need to measure the determinacy of Generator in PE. + auto engine = std::make_shared(); + if (seed == 0) { + seed = GetRandomSeed(); + VLOG(4) << "Use default random engine with random seed = " << seed; + } else { + VLOG(4) << "Use default random engine with fixed random seed = " << seed; + } + static std::mutex mu_; + { + std::lock_guard lock(mu_); + engine->seed(seed); + } + return engine; + } +} -GeneratorState* Generator::GetState() { - std::lock_guard lock(this->mutex); - return this->state_.get(); +GeneratorState Generator::GetState() { + std::lock_guard lock(this->mu_); + state_.cpu_engine = *engine_; + return this->state_; } -void Generator::SetState(GeneratorState* state_in) { - std::lock_guard lock(this->mutex); - *this->state_ = *state_in; +void Generator::SetState(const GeneratorState& state) { + std::lock_guard lock(this->mu_); + this->state_ = state; + this->engine_ = std::make_shared(state.cpu_engine); } uint64_t Generator::GetCurrentSeed() { - std::lock_guard lock(this->mutex); - return this->state_->current_seed; + std::lock_guard lock(this->mu_); + return this->state_.current_seed; } uint64_t Generator::Seed() { - std::lock_guard lock(this->mutex); + std::lock_guard lock(this->mu_); uint64_t seed; std::random_device de; seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF; - this->state_->current_seed = seed; + this->state_.current_seed = seed; std::seed_seq seq({seed}); - this->state_->cpu_engine.seed(seq); + this->engine_->seed(seq); - return this->state_->current_seed; + return this->state_.current_seed; } void Generator::SetCurrentSeed(uint64_t seed) { - std::lock_guard lock(this->mutex); - this->state_->current_seed = uint64_t(seed); + std::lock_guard lock(this->mu_); + this->state_.current_seed = seed; std::seed_seq seq({seed}); - this->state_->cpu_engine.seed(seq); + this->engine_->seed(seq); } -std::mt19937_64& Generator::GetCPUEngine() { - std::lock_guard lock(this->mutex); - return this->state_->cpu_engine; +std::shared_ptr 
Generator::GetCPUEngine() { + std::lock_guard lock(this->mu_); + return this->engine_; } -void Generator::SetCPUEngine(std::mt19937_64 engine) { - std::lock_guard lock(this->mutex); - this->state_->cpu_engine = std::mt19937_64(engine); +void Generator::SetCPUEngine(std::shared_ptr engine) { + std::lock_guard lock(this->mu_); + this->engine_ = engine; } uint64_t Generator::Random64() { - std::lock_guard lock(this->mutex); - return this->state_->cpu_engine(); + std::lock_guard lock(this->mu_); + auto engine = this->engine_; + return (*engine)(); +} + +void Generator::SetIsInitPy(bool is_init_py) { + this->is_init_py_ = is_init_py; + VLOG(4) << "SetIsInitPy:" << this->is_init_py_; } +bool Generator::GetIsInitPy() const { return this->is_init_py_; } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 17870782ba72a3247de734642962ffec48c0c91e..82b35f7ad550e770e8d10457ddf6cdf8e6fbd709 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include + #include #include #include // temp for debug @@ -27,6 +29,12 @@ limitations under the License. */ namespace paddle { namespace framework { +static uint64_t GetRandomSeed() { + std::random_device rd; + // double has 53 bit significant, so limit uint64 to 53 bits + return ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF; +} + struct GeneratorState { int64_t device = -1; uint64_t current_seed = 34342423252; @@ -35,62 +43,67 @@ struct GeneratorState { struct Generator { Generator() { - GeneratorState default_gen_state_cpu; - default_gen_state_cpu.device = -1; - default_gen_state_cpu.current_seed = 34342423252; - std::seed_seq seq({34342423252}); - default_gen_state_cpu.cpu_engine = std::mt19937_64(seq); - this->state_ = std::make_shared(default_gen_state_cpu); + auto seed = GetRandomSeed(); + std::seed_seq seq({seed}); + auto engine = std::make_shared(seq); + this->state_.cpu_engine = *engine; + this->state_.device = -1; + this->state_.current_seed = seed; + this->engine_ = engine; + VLOG(4) << "initial seed: " << this->state_.current_seed + << ", cpu engine: " << &this->state_.cpu_engine; + } + explicit Generator(uint64_t seed) { + std::seed_seq seq({seed}); + auto engine = std::make_shared(seq); + this->state_.cpu_engine = *engine; + this->state_.device = -1; + this->state_.current_seed = seed; + this->engine_ = engine; + VLOG(4) << "initial seed: " << this->state_.current_seed + << ", cpu engine: " << &this->state_.cpu_engine; + this->is_init_py_ = true; // TODO(zhiqiu): remove it in future } - explicit Generator(GeneratorState state_in) - : state_{std::make_shared(state_in)} {} - Generator(const Generator& other) - : Generator(other, std::lock_guard(other.mutex)) {} + Generator(const Generator& other) = delete; // get random state - GeneratorState* GetState(); + GeneratorState GetState(); // set random state - void SetState(GeneratorState* state_in); + void SetState(const GeneratorState&); // get current seed uint64_t GetCurrentSeed(); // random a seed and get uint64_t Seed(); - // set seed void SetCurrentSeed(uint64_t seed); // get cpu engine - std::mt19937_64& GetCPUEngine(); + std::shared_ptr GetCPUEngine(); // set cpu engine - void SetCPUEngine(std::mt19937_64 engine); + void SetCPUEngine(std::shared_ptr); uint64_t Random64(); - bool is_init_py = false; + void SetIsInitPy(bool); + bool GetIsInitPy() const; - // CPU Generator singleton - static 
std::shared_ptr GetInstance() { - if (NULL == gen_instance_) { - gen_instance_.reset(new paddle::framework::Generator()); - } - return gen_instance_; - } + private: + GeneratorState state_; + std::shared_ptr engine_; + mutable std::mutex mu_; + + // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with + // old seed, and it should be removed after all random-related operators + // and unittests upgrades to use generator. + bool is_init_py_ = false; +}; - static std::shared_ptr GetInstanceX() { - if (NULL == gen_instance_) { - gen_instance_.reset(new paddle::framework::Generator()); - } - gen_instance_->is_init_py = true; - return gen_instance_; - } +// The DefaultCPUGenerator is used in manual_seed() +const std::shared_ptr& DefaultCPUGenerator(); - private: - static std::shared_ptr gen_instance_; - std::shared_ptr state_; - mutable std::mutex mutex; +// If op seed is set or global is not set, the OpDefaultCPUEngine is used. +std::shared_ptr OpDefaultCPUEngine(); - Generator(const Generator& other, const std::lock_guard&) - : state_(std::make_shared(*(other.state_))) {} -}; +std::shared_ptr GetCPURandomEngine(uint64_t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ff6dffa704eeceeabfc5eb1d6786f40b2e523e98..3d65fe595373fa98ba237f04134c75d4a60a7242 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1879,6 +1879,19 @@ PDNode *patterns::MultipleQuantize::operator()() { return prev_out; } +PDNode *patterns::QuantizePlacement::operator()( + const std::unordered_set &quantize_enabled_op_types) { + std::unordered_set supported_op_types = + std::unordered_set({"concat", "conv2d", "elementwise_add", + "fc", "matmul", "pool2d", "prior_box", + "relu", "reshape2", "transpose2"}); + if (!quantize_enabled_op_types.empty()) { + supported_op_types = quantize_enabled_op_types; + } + auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types); + return op; +} + PDNode *patterns::MKLDNNInPlace::operator()() { const std::unordered_set &supported_op_types = { "abs", diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index e1cce7848dd54b02a540b144ca1088f62eeb52cb..0803265884165bc754489b18d07c0d277a4bd92b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1120,6 +1120,15 @@ struct MultipleQuantize : public PatternBase { PATTERN_DECL_NODE(prev_out); }; +struct QuantizePlacement : public PatternBase { + QuantizePlacement(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "quantize_placement") {} + PDNode* operator()( + const std::unordered_set& quantize_enabled_op_types); + + PATTERN_DECL_NODE(op); +}; + // Pattern used for enforcing inplace computation for in-place computation // supporting DNNL ops. 
softmax, batch_norm and layer_norm struct MKLDNNInPlace : public PatternBase { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 6be4ce566e01e9bcb89a38cbdc2bbd11551a065e..bc268a834780cad843a18a74bb7f50a639db103d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,30 +26,33 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { Get>("quantize_excluded_op_ids"); const auto& op_types_list = Get>("quantize_enabled_op_types"); - for (const Node* n : graph->Nodes()) { - if (n->IsOp()) { - if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(), - n->id()) != excluded_ids_list.end()) - continue; - auto* op = n->Op(); - if (op->HasAttr("mkldnn_data_type") || - op->HasProtoAttr("mkldnn_data_type")) { - // use_quantizer is no longer used - // assign value for compatibility - if (op->GetAttrIfExists("use_quantizer")) { - op->SetAttr("mkldnn_data_type", std::string("int8")); - } - if (op_types_list.empty()) { - op->SetAttr("mkldnn_data_type", std::string("int8")); - op->SetAttr("use_quantizer", true); - } else if (std::find(op_types_list.begin(), op_types_list.end(), - op->Type()) != op_types_list.end()) { - op->SetAttr("mkldnn_data_type", std::string("int8")); - op->SetAttr("use_quantizer", true); - } + Init(name_scope_, graph); + GraphPatternDetector gpd; + patterns::QuantizePlacement quantize_placement_pattern{gpd.mutable_pattern(), + "quantize_placement"}; + quantize_placement_pattern(op_types_list); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, quantize_placement_pattern); + + if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(), + op->id()) != excluded_ids_list.end()) { + return; + } + + if (op->Op()->HasAttr("mkldnn_data_type") || + op->Op()->HasProtoAttr("mkldnn_data_type")) { + // use_quantizer is no longer used + // assign value for compatibility + if (op->Op()->GetAttrIfExists("use_quantizer")) { + op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); } + op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); + op->Op()->SetAttr("use_quantizer", true); } - } + }; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h index 008a462dc414c04f53315a8f262de15ab8fb7fb5..f3229e59d6ffb97514adb9c871d4fb981fc964e0 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h @@ -15,7 +15,10 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/framework/ir/pass.h" +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -23,9 +26,10 @@ namespace ir { /* * Specifies which operators should be quantized. 
*/ -class CPUQuantizePlacementPass : public Pass { +class CPUQuantizePlacementPass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; + const std::string name_scope_{"cpu_quantize_placement_pass"}; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index 95e321e5b7190499f98c9df3dbef217310abcfcd..761defc25ff5c89b740ccd5adff7d613beccd9d4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -130,7 +130,7 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) { MainTest({"conv2d"}, {4}, 1); } -TEST(QuantizerPlacementPass, excluded_none) { +TEST(QuantizerPlacementPass, empty_list) { // all operators quantized MainTest({}, {}, 6); } diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 4506c162fa743a3fcb5973a9f0ebd9e8f6cdcd36..56ae02d49ef522fbf243d8dbc62ee319cbba425b 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -81,7 +81,8 @@ void DeleteQuant(ir::Graph* graph, Scope* scope, if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" || quantized_op_type == "depthwise_conv2d" || - quantized_op_type == "fc") { + quantized_op_type == "fc" || + quantized_op_type == "conv2d_transpose") { op_desc->SetAttr("Input_scale", scale_value); } else if (quantized_op_type == "mul") { op_desc->SetAttr("X_scale", scale_value); @@ -111,7 +112,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope, std::string input_name = ""; if (quantized_op_type == "conv2d" || quantized_op_type == "depthwise_conv2d" || - quantized_op_type == "conv2d_fusion") { + quantized_op_type == "conv2d_fusion" || + quantized_op_type == "conv2d_transpose") { weight_name = "Filter"; input_name = "Input"; } else if (quantized_op_type == "mul") { @@ -122,7 +124,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope, input_name = "Input"; } else { PADDLE_THROW(platform::errors::Unimplemented( - "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for " + "QuantDequantFuse: We only support conv2d, conv2d_fusion, " + "conv2d_transpose, fc, mul for " "now.")); } const std::string pattern_name = "dequant_fuse"; @@ -192,10 +195,12 @@ void FuseDequant(ir::Graph* graph, Scope* scope, scope->Var(quantized_op_weight_node->Name())->GetMutable(); auto w_dims = weight_tensor->dims(); // If quantized op is fc, weight scale size = 1; - // If quantized op is conv, weight scale size = weight dims[0] + // If quantized op is conv2d, weight scale size = weight dims[0] + // If quantized op is conv2d_transpose, weight scale size = weight dims[1] bool valid_scale_size = (weight_scale.size() == 1 || - weight_scale.size() == static_cast(w_dims[0])); + weight_scale.size() == static_cast(w_dims[0]) || + weight_scale.size() == static_cast(w_dims[1])); PADDLE_ENFORCE_EQ( valid_scale_size, true, platform::errors::InvalidArgument( @@ -206,8 +211,14 @@ void FuseDequant(ir::Graph* graph, Scope* scope, if (weight_scale.size() == 1) { quantized_weight_data[j] *= weight_scale[0]; } else { - int inner_size = w_dims[1] * w_dims[2] * w_dims[3]; - quantized_weight_data[j] *= weight_scale[j / inner_size]; + if (quantized_op_type == "conv2d_transpose") { + int inner_size = w_dims[2] * w_dims[3]; + 
quantized_weight_data[j] *= + weight_scale[(j / inner_size) % w_dims[1]]; + } else { + int inner_size = w_dims[1] * w_dims[2] * w_dims[3]; + quantized_weight_data[j] *= weight_scale[j / inner_size]; + } } } @@ -220,7 +231,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope, new_op_desc.SetType(quantized_op_type); new_op_desc.SetAttr("enable_int8", true); if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" || - quantized_op_type == "depthwise_conv2d") { + quantized_op_type == "depthwise_conv2d" || + quantized_op_type == "conv2d_transpose") { new_op_desc.SetInput("Input", {new_input}); new_op_desc.SetOutput("Output", {new_output}); } else if (quantized_op_type == "fc") { @@ -253,7 +265,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { std::unordered_set quant_types = { "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; std::unordered_set quantized_op_types = { - "conv2d", "mul", "depthwise_conv2d", "fc"}; + "conv2d", "mul", "depthwise_conv2d", "fc", "conv2d_transpose"}; auto* scope = param_scope(); for (auto& quant_type : quant_types) { diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index d46f8a574c0d956dc0a90bc2741d2cb80313ab7f..4307e51862df572e013431fceaaf89cc1cf6679c 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { // CPU, CUDA, PLAIN are same library type. } else if (s == std::string("CPU")) { return LibraryType::kPlain; + } else if (s == std::string("XPU")) { + return LibraryType::kPlain; } else if (s == std::string("CUDA")) { return LibraryType::kPlain; } else { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 78595e50b2da627065309041079839faa197cc8f..bccc92e5c4352927f309f3605bb3c8d8dd823bb5 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -78,21 +78,37 @@ class CompileTimeInferShapeContext : public InferShapeContext { void ShareDim(const std::string &in, const std::string &out, size_t i = 0, size_t j = 0) override { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); + PADDLE_ENFORCE_LT(i, Inputs(in).size(), + platform::errors::InvalidArgument( + "The input variable index is out of range, expected " + "index less than %d, but received index is %d.", + Inputs(in).size(), i)); + PADDLE_ENFORCE_LT(j, Outputs(out).size(), + platform::errors::InvalidArgument( + "The output variable index is out of range, expected " + "index less than %d, but received index is %d.", + Outputs(out).size(), j)); + std::string input_n = Inputs(in)[i]; std::string output_n = Outputs(out)[j]; - PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@", - in, i); - PADDLE_ENFORCE(output_n != framework::kEmptyVarName, - "The %s[%d] is @EMPTY@", out, j); + PADDLE_ENFORCE_NE(input_n, framework::kEmptyVarName, + platform::errors::InvalidArgument( + "The input variable %s[%d] is empty.", in, i)); + PADDLE_ENFORCE_NE(output_n, framework::kEmptyVarName, + platform::errors::InvalidArgument( + "The output variable %s[%d] is empty.", out, j)); auto *in_var = block_.FindVarRecursive(input_n); auto *out_var = block_.FindVarRecursive(output_n); - PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(), - "The type of %s and %s is not the same.", input_n, output_n); + PADDLE_ENFORCE_EQ( + in_var->GetType(), out_var->GetType(), + platform::errors::InvalidArgument( + "The 
type of input %s and output %s do not match. The input type " + "is %s, output type is %s.", + input_n, output_n, DataTypeToString(in_var->GetType()), + DataTypeToString(out_var->GetType()))); SetDim(output_n, GetDim(input_n)); } @@ -126,12 +142,22 @@ class CompileTimeInferShapeContext : public InferShapeContext { void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, size_t j = 0) const override { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName, - "The %s[%d] is @EMPTY@", in, i); - PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName, - "The %s[%d] is @EMPTY@", out, j); + PADDLE_ENFORCE_LT(i, Inputs(in).size(), + platform::errors::InvalidArgument( + "The input variable index is out of range, expected " + "index less than %d, but received index is %d.", + Inputs(in).size(), i)); + PADDLE_ENFORCE_LT(j, Outputs(out).size(), + platform::errors::InvalidArgument( + "The output variable index is out of range, expected " + "index less than %d, but received index is %d.", + Outputs(out).size(), j)); + PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName, + platform::errors::InvalidArgument( + "The input variable %s[%d] is empty.", in, i)); + PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName, + platform::errors::InvalidArgument( + "The output variable %s[%d] is empty.", out, j)); auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); if (in_var->GetType() != proto::VarType::LOD_TENSOR && @@ -144,30 +170,38 @@ class CompileTimeInferShapeContext : public InferShapeContext { int32_t GetLoDLevel(const std::string &in, size_t i = 0) const override { PADDLE_ENFORCE_LT(i, Inputs(in).size(), - "Input %s of operator %s only has %d elements.", in, - op_.Type(), Inputs(in).size()); + platform::errors::InvalidArgument( + "The input variable index is out of range, input " + "variable %s of operator %s only has %d elements.", + in, op_.Type(), Inputs(in).size())); PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName, - "Input %s[%d] of operator %s is @EMPTY@", in, op_.Type(), - i); + platform::errors::InvalidArgument( + "The input variable %s[%d] of operator %s is empty.", + in, i, op_.Type())); auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); PADDLE_ENFORCE_NOT_NULL( - in_var, "Input %s[%d] of operator %s should not be nullptr.", in, - op_.Type(), i); + in_var, platform::errors::NotFound( + "The input variable %s[%d] of operator %s is not found.", + in, i, op_.Type())); return in_var->GetLoDLevel(); } void SetLoDLevel(const std::string &out, int32_t lod_level, size_t j = 0) const override { PADDLE_ENFORCE_LT(j, Outputs(out).size(), - "Output %s of operator %s only has %d elements.", out, - op_.Type(), Outputs(out).size()); + platform::errors::InvalidArgument( + "The output variable index is out of range, output " + "variable %s of operator %s only has %d elements.", + out, op_.Type(), Outputs(out).size())); PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName, - "Output %s[%d] of operator %s is @EMPTY@", out, - op_.Type(), j); + platform::errors::InvalidArgument( + "The output variable %s[%d] of operator %s is empty.", + out, j, op_.Type())); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); PADDLE_ENFORCE_NOT_NULL( - out_var, "Output %s[%d] of operator %s should not be nullptr.", out, - op_.Type(), j); + out_var, platform::errors::NotFound( + "The output variable %s[%d] of operator %s is 
not found.", + out, j, op_.Type())); if (lod_level >= 0) { out_var->SetLoDLevel(lod_level); } @@ -200,8 +234,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string &name) const override { const std::vector &arg_names = Inputs(name); PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, - "Input(%s) should hold one element, but now it holds %d", - name, arg_names.size()); + platform::errors::InvalidArgument( + "The input(%s) should hold only one element, but now " + "it holds %d elements.", + name, arg_names.size())); return this->GetDim(arg_names[0]); } @@ -225,8 +261,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string &name, const DDim &dim) override { auto arg_names = Outputs(name); PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, - "Output(%s) should hold one element, but now it holds %d", - name, arg_names.size()); + platform::errors::InvalidArgument( + "The iutput(%s) should hold only one element, but " + "now it holds %d elements.", + name, arg_names.size())); SetDim(arg_names[0], dim); } @@ -252,7 +290,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { DDim GetDim(const std::string &name) const { auto var = block_.FindVarRecursive(name); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found.", name)); DDim res; try { auto shape = var->GetShape(); @@ -278,7 +317,11 @@ class CompileTimeInferShapeContext : public InferShapeContext { void SetDims(const std::vector &names, const std::vector &dims) { size_t length = names.size(); - PADDLE_ENFORCE_EQ(length, dims.size()); + PADDLE_ENFORCE_EQ(length, dims.size(), + platform::errors::InvalidArgument( + "The input variables number(%d) and input dimensions " + "number(%d) do not match.", + length, dims.size())); for (size_t i = 0; i < length; ++i) { if (names[i] == framework::kEmptyVarName) { continue; @@ -364,8 +407,10 @@ proto::OpDesc *OpDesc::Proto() { const std::vector &OpDesc::Input(const std::string &name) const { auto it = inputs_.find(name); - PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, - Type()); + PADDLE_ENFORCE_NE( + it, inputs_.end(), + platform::errors::NotFound("Input %s cannot be found in operator %s.", + name, Type())); return it->second; } @@ -385,8 +430,10 @@ void OpDesc::SetInput(const std::string ¶m_name, const std::vector &OpDesc::Output(const std::string &name) const { auto it = outputs_.find(name); - PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", - name, Type()); + PADDLE_ENFORCE_NE( + it, outputs_.end(), + platform::errors::NotFound("Output %s cannot be found in operator %s.", + name, Type())); return it->second; } @@ -427,7 +474,8 @@ bool OpDesc::HasProtoAttr(const std::string &name) const { proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( + "Attribute %s is not found.", name)); return static_cast(it->second.which() - 1); } @@ -492,7 +540,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { return; } default: - PADDLE_THROW("Wrong attr type %d", attr.type()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported attribute type (code %d).", attr.type())); } need_update_ = true; return; @@ -529,7 +578,8 @@ void OpDesc::SetAttrMap( Attribute 
OpDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( + "Attribute %s is not found.", name)); return it->second; } @@ -543,7 +593,8 @@ const proto::OpProto::Attr &OpDesc::GetProtoAttr( } } - PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type()); + PADDLE_THROW(platform::errors::NotFound( + "Attribute %s is not found in proto %s.", name, proto.type())); } Attribute OpDesc::GetNullableAttr(const std::string &name) const { @@ -557,7 +608,10 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const { std::vector OpDesc::GetBlocksAttrIds(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound( + "Attribute `%s` is not found in operator `%s`.", name, desc_.type())); auto blocks = BOOST_GET_CONST(std::vector, it->second); std::vector ids; @@ -570,7 +624,10 @@ std::vector OpDesc::GetBlocksAttrIds(const std::string &name) const { int OpDesc::GetBlockAttrId(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound( + "Attribute `%s` is not found in operator `%s`.", name, desc_.type())); return BOOST_GET_CONST(BlockDesc *, it->second)->ID(); } @@ -657,7 +714,11 @@ struct SetAttrDescVisitor : public boost::static_visitor { VectorToRepeated(v, attr_->mutable_longs()); } - void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } + void operator()(boost::blank) const { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method of SetAttrDescVisitor object for " + "`boosst::blank` type.")); + } }; void OpDesc::Flush() { @@ -691,8 +752,9 @@ void OpDesc::Flush() { } void OpDesc::CheckAttrs() { - PADDLE_ENFORCE(!Type().empty(), - "CheckAttr() can not be called before type is set."); + PADDLE_ENFORCE_EQ(Type().empty(), false, + platform::errors::PreconditionNotMet( + "CheckAttrs() can not be called before type is set.")); auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); if (checker == nullptr) { // checker is not configured. 
That operator could be generated by Paddle, @@ -707,8 +769,10 @@ void OpDesc::InferShape(const BlockDesc &block) const { try { VLOG(3) << "CompileTime infer shape on " << Type(); auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; - PADDLE_ENFORCE(static_cast(infer_shape), - "%s's infer_shape has not been registered", this->Type()); + PADDLE_ENFORCE_EQ( + static_cast(infer_shape), true, + platform::errors::NotFound( + "Operator %s's infer_shape is not registered.", this->Type())); CompileTimeInferShapeContext ctx(*this, block); if (VLOG_IS_ON(10)) { std::ostringstream sout; @@ -758,10 +822,10 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, - "Input(%s) should have only one value, " - "but it have %d now", - name, length); + PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument( + "Input(%s) should have only one value, " + "but it has %d values now.", + name, length)); return block_.HasVarRecursive(input_names[0]); } @@ -774,10 +838,10 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, - "Output(%s) should have only one value, " - "but it have %d now", - name, length); + PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument( + "Output(%s) should have only one value, " + "but it has %d values now.", + name, length)); return block_.HasVarRecursive(output_names[0]); } @@ -826,7 +890,8 @@ std::vector CompileTimeInferShapeContext::Outputs( std::vector CompileTimeInferShapeContext::GetRepeatedDims( const std::string &name) const { auto var = block_.FindVarRecursive(name); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found.", name)); std::vector res; try { auto shapes = var->GetShapes(); @@ -848,7 +913,8 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name, void CompileTimeInferShapeContext::SetRepeatedDims( const std::string &name, const std::vector &dims) { auto var = block_.FindVarRecursive(name); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found.", name)); std::vector> dim_vec(dims.size()); std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>); var->SetShapes(dim_vec); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 0f842637a58e0897e8b68fe06d1e712ffd20ad97..d8159d6a5c294b85d8d5ab9bbee3b95a5eba793f 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx +#include +#include +#include +#include + +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace compatible { + +struct OpUpdateRecord { + enum class Type { + kInvalid = 0, + kModifyAttr, + kNewAttr, + kNewInput, + kNewOutput, + kBugfixWithBehaviorChanged, + }; + Type type_; + std::string remark_; +}; + +struct ModifyAttr : OpUpdateRecord { + ModifyAttr(const std::string& name, const std::string& remark, + const boost::any& default_value) + : OpUpdateRecord({Type::kModifyAttr, remark}), + name_(name), + default_value_(default_value) { + // TODO(Shixiaowei02): Check the data type with proto::OpDesc. 
+ } + + private: + std::string name_; + boost::any default_value_; +}; + +struct NewAttr : OpUpdateRecord { + NewAttr(const std::string& name, const std::string& remark, + const boost::any& default_value) + : OpUpdateRecord({Type::kNewAttr, remark}), + name_(name), + default_value_(default_value) {} + + private: + std::string name_; + boost::any default_value_; +}; + +struct NewInput : OpUpdateRecord { + NewInput(const std::string& name, const std::string& remark) + : OpUpdateRecord({Type::kNewInput, remark}), name_(name) {} + + private: + std::string name_; +}; + +struct NewOutput : OpUpdateRecord { + NewOutput(const std::string& name, const std::string& remark) + : OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {} + + private: + std::string name_; +}; + +struct BugfixWithBehaviorChanged : OpUpdateRecord { + explicit BugfixWithBehaviorChanged(const std::string& remark) + : OpUpdateRecord({Type::kBugfixWithBehaviorChanged, remark}) {} +}; + +class OpVersionDesc { + public: + OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark, + boost::any default_value) { + infos_.push_back(std::shared_ptr( + new compatible::ModifyAttr(name, remark, default_value))); + return *this; + } + + OpVersionDesc& NewAttr(const std::string& name, const std::string& remark, + boost::any default_value) { + infos_.push_back(std::shared_ptr( + new compatible::NewAttr(name, remark, default_value))); + return *this; + } + + OpVersionDesc& NewInput(const std::string& name, const std::string& remark) { + infos_.push_back(std::shared_ptr( + new compatible::NewInput(name, remark))); + return *this; + } + + OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) { + infos_.push_back(std::shared_ptr( + new compatible::NewOutput(name, remark))); + return *this; + } + + OpVersionDesc& BugfixWithBehaviorChanged(const std::string& remark) { + infos_.push_back(std::shared_ptr( + new compatible::BugfixWithBehaviorChanged(remark))); + return *this; + } + + private: + std::vector> infos_; +}; + +class OpVersion { + public: + OpVersion& AddCheckpoint(const std::string& note, + const OpVersionDesc& op_version_desc) { + checkpoints_.push_back(Checkpoint({note, op_version_desc})); + return *this; + } + + private: + struct Checkpoint { + std::string note_; + OpVersionDesc op_version_desc_; + }; + std::vector checkpoints_; +}; + +class OpVersionRegistrar { + public: + static OpVersionRegistrar& GetInstance() { + static OpVersionRegistrar instance; + return instance; + } + OpVersion& Register(const std::string& op_type) { + if (op_version_map_.find(op_type) != op_version_map_.end()) { + PADDLE_THROW("'%s' is registered in operator version more than once.", + op_type); + } + op_version_map_.insert({op_type, OpVersion()}); + return op_version_map_[op_type]; + } + + private: + std::unordered_map op_version_map_; + + OpVersionRegistrar() = default; + OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; +}; + +} // namespace compatible +} // namespace framework +} // namespace paddle + +#define REGISTER_OP_VERSION(op_type) \ + static paddle::framework::compatible::OpVersion \ + RegisterOpVersion__##op_type = \ + paddle::framework::compatible::OpVersionRegistrar::GetInstance() \ + .Register(#op_type) diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..80ad51ad07b5a84cfabb3ace9b478b1f6ea24f95 --- /dev/null +++ 
b/paddle/fluid/framework/op_version_registry_test.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace compatible { + +TEST(test_operator_version, test_operator_version) { + REGISTER_OP_VERSION(test__) + .AddCheckpoint( + R"ROC(Fix the bug of reshape op, support the case of axis < 0)ROC", + framework::compatible::OpVersionDesc().BugfixWithBehaviorChanged( + "Support the case of axis < 0")) + .AddCheckpoint( + R"ROC( + Upgrade reshape, modified one attribute [axis] and add a new attribute [size]. + )ROC", + framework::compatible::OpVersionDesc() + .ModifyAttr("axis", + "Increased from the original one method to two.", -1) + .NewAttr("size", + "In order to represent a two-dimensional rectangle, the " + "parameter size is added.", + 0)) + .AddCheckpoint( + R"ROC( + Add a new attribute [height] + )ROC", + framework::compatible::OpVersionDesc().NewAttr( + "height", + "In order to represent a two-dimensional rectangle, the " + "parameter height is added.", + 0)) + .AddCheckpoint( + R"ROC( + Add a input [X2] and a output [Y2] + )ROC", + framework::compatible::OpVersionDesc() + .NewInput("X2", "The second input.") + .NewOutput("Y2", "The second output.")); +} +} // namespace compatible +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c8c18bcee6a8868919c584527c088725c1c9d58d..ca2705f154c4f45dfccd954b23209c71701adce5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -34,6 +34,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_info.h" +#endif #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #else auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::SetDeviceId(dev_id); +#endif + } else if (platform::is_xpu_place(place)) { +#ifndef PADDLE_WITH_XPU + PADDLE_THROW(platform::errors::Unimplemented( + "Cannot run operator on place %s", place)); +#else + auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + platform::SetXPUDeviceId(dev_id); #endif } @@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_XPU + if (kernel_iter == kernels.end() && + is_xpu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing XPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif if (kernel_iter == kernels.end()) { PADDLE_THROW("op %s does not have kernel for %s", type_, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8c6dd628bb9748bb120c1c39841e199659fb53fc..12e0f97f1262ca0f6bf8fc70ab5b482fb0bdd305 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]), + platform::errors::Unavailable( + "XPU is not supported in ParallelExecutor")); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); member_->use_cuda_ = exec_strategy.use_cuda_; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index a56ca342ad1a867ee768d70c95e3f96c76ceda4a..c3626c5c9e0506f12ca77aac5086cb18e272a771 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -54,18 +54,43 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_XPU + else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && + platform::is_xpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else if (platform::is_xpu_place(src_place) && + platform::is_xpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, 
size); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #ifdef PADDLE_WITH_CUDA else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); auto ctx_place = ctx.GetPlace(); @@ -75,8 +100,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (platform::is_cpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); @@ -86,8 +112,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cuda_pinned_place = BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place); @@ -108,8 +135,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, reinterpret_cast(ctx).stream(); memory::Copy(dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (platform::is_cuda_pinned_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cuda_pinned_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_cuda_pinned_place = BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); @@ -130,8 +158,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); - } else if (platform::is_gpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); @@ -154,7 +183,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_THROW("ctx is not belong to 
dst_gpu_place or src_gpu_place."); } } - } else { + } + else { // NOLINT PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); } #endif @@ -200,44 +230,74 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_XPU + else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + } else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #ifdef PADDLE_WITH_CUDA else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size, nullptr); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); - } else if (platform::is_cpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); - } else if (platform::is_gpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); 
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); - } else if (platform::is_cuda_pinned_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cuda_pinned_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_pinned_place = BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr); - } else { + } + else { // NOLINT PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); } #endif @@ -276,6 +336,19 @@ class AnyVisitor : public boost::static_visitor { const framework::Tensor& tensor_; Predicate predicate_; + bool GetResultHelper(const framework::Tensor& out, + const platform::Place& place) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto ctx = platform::DeviceContextPool::Instance().Get(place); + ctx->Wait(); + TensorCopy(out, cpu, *ctx, &tmp); + ctx->Wait(); + return GetResult(tmp, cpu); + } + public: AnyVisitor(const framework::Tensor& tensor, Predicate predicate) : tensor_(tensor), predicate_(std::move(predicate)) {} @@ -290,17 +363,14 @@ class AnyVisitor : public boost::static_visitor { return this->GetResult(out, place); } + bool GetResult(const framework::Tensor& out, + const platform::XPUPlace& xpu) const { + return GetResultHelper(out, xpu); + } + bool GetResult(const framework::Tensor& out, const platform::CUDAPlace& gpu) const { - platform::CPUPlace cpu; - framework::Tensor tmp; - tmp.Resize({1}); - tmp.mutable_data(cpu); - auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); - gpuctx->Wait(); - TensorCopy(out, cpu, *gpuctx, &tmp); - gpuctx->Wait(); - return GetResult(tmp, cpu); + return GetResultHelper(out, gpu); } bool GetResult(const framework::Tensor& out, @@ -350,6 +420,61 @@ inline void Any(const framework::Tensor& tensor, Predicate predicate, platform::VisitPlace(place, visitor); } +template +struct AllDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AllDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void apply() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenVector::Flatten(*out_); + o.device(*ctx_.eigen_device()) = predicate_(t); + } +}; + +template +inline void AllImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(tensor.type(), AllDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +class AllOutVisitor : public boost::static_visitor<> { + private: + const framework::Tensor& tensor_; + mutable framework::Tensor* out_; + Predicate predicate_; + + public: + AllOutVisitor(const framework::Tensor& tensor, Predicate predicate, + framework::Tensor* out) + : tensor_(tensor), out_(out), predicate_(predicate) {} + + template + void operator()(const Place& place) const { + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + out_->Resize(tensor_.dims()); + out_->mutable_data(place); + AllImpl(predicate_, tensor_, *ctx, out_); + } +}; + +template +inline void All(const framework::Tensor& tensor, Predicate predicate, + framework::Tensor* out) { + AllOutVisitor visitor(tensor, predicate, out); + auto place = tensor.place(); + platform::VisitPlace(place, visitor); +} + struct 
ContainsNANPredicate { template auto operator()(const T& eigen_vec) const @@ -370,6 +495,12 @@ void TensorContainsNAN(const framework::Tensor& tensor, Any(tensor, predicate, out); } +void TensorContainsNANV2(const framework::Tensor& tensor, + framework::Tensor* out) { + ContainsNANPredicate predicate; + All(tensor, predicate, out); +} + struct ContainsInfPredicate { template auto operator()(const T& eigen_vec) const @@ -390,6 +521,12 @@ void TensorContainsInf(const framework::Tensor& tensor, Any(tensor, predicate, out); } +void TensorContainsInfV2(const framework::Tensor& tensor, + framework::Tensor* out) { + ContainsInfPredicate predicate; + All(tensor, predicate, out); +} + // NOTE(dzhwinter): // Isfinite need a AllVisitor to loop through all the elements. // We choose two cuda call instead of one allvisitor. The AllVisitor @@ -402,8 +539,8 @@ bool TensorIsfinite(const framework::Tensor& tensor) { #ifdef PADDLE_WITH_CUDA template -static inline void __global__ BothFalse(const T* cmp, T* out) { - out[0] = (!cmp[0]) && (!out[0]); +static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) { + CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); } } #endif @@ -418,25 +555,47 @@ struct BothFalseVisitor : public boost::static_visitor<> { VisitorImpl(place); } + void VisitorImpl(const platform::XPUPlace& xpu) const { + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); + } + void VisitorImpl(const platform::CUDAPlace& gpu) const { #ifdef PADDLE_WITH_CUDA auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu); - BothFalse<<<1, 1, 0, ctx->stream()>>>(in_.data(), - out_->mutable_data(gpu)); + constexpr int MAX_BLOCK_DIM = 512; + const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int element_num = in_.numel(); + int block_size = (element_num >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(element_num))); + int grid_size = element_num / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? 
MAX_GRID_DIM : grid_size; + BothFalse<<stream()>>>( + in_.data(), out_->mutable_data(gpu), element_num); #endif } void VisitorImpl(const platform::CPUPlace& cpu) const { - bool lhs = !in_.data()[0]; - bool rhs = !out_->mutable_data(cpu)[0]; - out_->mutable_data(cpu)[0] = lhs && rhs; + int num = in_.numel(); + const bool* in_ptr = in_.data(); + bool* out_ptr = out_->data(); + for (int i = 0; i < num; ++i) { + bool lhs = !in_ptr[i]; + bool rhs = !out_ptr[i]; + out_ptr[i] = lhs && rhs; + } } void VisitorImpl( const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const { - bool lhs = !in_.data()[0]; - bool rhs = !out_->mutable_data(cpu)[0]; - out_->mutable_data(cpu)[0] = lhs && rhs; + int num = in_.numel(); + const bool* in_ptr = in_.data(); + bool* out_ptr = out_->data(); + for (int i = 0; i < num; ++i) { + bool lhs = !in_ptr[i]; + bool rhs = !out_ptr[i]; + out_ptr[i] = lhs && rhs; + } } }; @@ -449,6 +608,15 @@ void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { platform::VisitPlace(place, visitor); } +void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out) { + framework::Tensor tmp; + TensorContainsInfV2(tensor, &tmp); + TensorContainsNANV2(tensor, out); + BothFalseVisitor visitor(tmp, out); + auto place = tensor.place(); + platform::VisitPlace(place, visitor); +} + void TensorToStream(std::ostream& os, const Tensor& tensor, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version @@ -498,6 +666,28 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); +#endif + } else if (platform::is_xpu_place(tensor.place())) { +#ifdef PADDLE_WITH_XPU + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& xpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::XPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write); + xpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); #endif } else { os.write(static_cast(data_ptr), @@ -552,8 +742,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, void* buf; auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); - if (platform::is_gpu_place(dev_ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(dev_ctx.GetPlace()) || + platform::is_xpu_place(dev_ctx.GetPlace())) { +#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -563,8 +754,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); #else - PADDLE_THROW(platform::errors::Unimplemented( - "CUDAPlace is not supported when not compiled with CUDA")); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "CUDAPlace is not supported when not compiled with CUDA")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "XPUPlace is not supported when not compiled 
with XPU")); + } #endif } else { framework::VisitDataType( @@ -603,8 +799,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, void* buf; auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); - if (platform::is_gpu_place(dev_ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(dev_ctx.GetPlace()) || + platform::is_xpu_place(dev_ctx.GetPlace())) { +#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -614,8 +811,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); #else - PADDLE_THROW(platform::errors::Unimplemented( - "CUDAPlace is not supported when not compiled with CUDA")); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "CUDAPlace is not supported when not compiled with CUDA")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); + } #endif } else { framework::VisitDataType( @@ -700,6 +902,9 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { reinterpret_cast(*ctx).stream()); } #endif +#ifdef PADDLE_WITH_XPU + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); +#endif } template @@ -708,10 +913,20 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { auto element_num = tensor.numel(); os << " - data: ["; - if (element_num > 0) { - os << inspect[0]; - for (int j = 1; j < element_num; ++j) { - os << " " << inspect[j]; + // Note: int8_t && uint8_t is typedf of char, ostream unable to print properly + if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) { + if (element_num > 0) { + os << signed(inspect[0]); + for (int j = 1; j < element_num; ++j) { + os << " " << signed(inspect[j]); + } + } + } else { + if (element_num > 0) { + os << inspect[0]; + for (int j = 1; j < element_num; ++j) { + os << " " << inspect[j]; + } } } os << "]"; diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index c71327da64042aed85f1247f3c31de3e66a588ba..fce0142b41d3ae9b2a6fcd4f16d38b0492fbd806 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor, const platform::DeviceContext& dev_ctx, const size_t& seek, const std::vector& shape); +// store the bool result tensor in out tensor +void TensorContainsNANV2(const framework::Tensor& tensor, + framework::Tensor* out); +void TensorContainsInfV2(const framework::Tensor& tensor, + framework::Tensor* out); +void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out); + // convert dlpack's DLTensor to tensor void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index de1246883f1019bc3e6adabadbc9e071926eb772..9ad30506b2c3a0fac16d29c3bbee07725ff3d95d 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -30,12 +30,12 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(sort_sum_gradient); + namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, const 
detail::BackwardStrategy& strategy, - bool retain_graph) { - backward_strategy_ = strategy; +void BasicEngine::Init(VarBase* var, bool retain_graph) { retain_graph_ = retain_graph; init_node_ = var->GradVarBase()->GradNode(); var->GradVarBase()->ClearGradNode(); @@ -105,7 +105,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { auto& accumulator = accumulators_[var.get()]; if (!accumulator) { - if (backward_strategy_.sorted_sum_gradient_) { + if (FLAGS_sort_sum_gradient) { accumulator.reset(new SortedGradientAccumulator(var.get())); } else { accumulator.reset(new EagerGradientAccumulator(var.get())); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 4d25d81235098cca37491b1d8e43b481adc2fd0a..0906dd4f9236ecf26ef30395aa551b57e4e43b75 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -18,7 +18,6 @@ #include #include #include -#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/gradient_accumulator.h" @@ -30,8 +29,7 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, const detail::BackwardStrategy& strategy, - bool retain_graph = false); + void Init(VarBase* var, bool retain_graph = false); void Execute() override; @@ -46,7 +44,6 @@ class BasicEngine : public Engine { private: std::shared_ptr init_node_; - detail::BackwardStrategy backward_strategy_; std::unordered_map node_deps_; std::unordered_map> accumulators_; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index f5fc5944709fc94ef23b878a5f58c9cb1dfed63a..7caeb4378ce3d1ca1d1557054642c9fa184bea39 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> { blas.AXPY(numel_, 1., x_, y_); } + void operator()(const platform::XPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } + #ifdef PADDLE_WITH_CUDA void operator()(const platform::CUDAPlace& place) { platform::CUDADeviceContext* ctx = diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 4f133bf80c7904d9b6a84c933d431c2820b999e4..5c717835e5cc2042a7a3fdd8c51aa6eeff1fc523 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -33,6 +33,8 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" +DECLARE_bool(sort_sum_gradient); + namespace paddle { namespace imperative { @@ -529,8 +531,7 @@ class PartialGradTask { const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, - const detail::BackwardStrategy &strategy, bool create_graph, + const platform::Place &place, bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs); std::vector> Run(); @@ -577,7 +578,6 @@ class PartialGradTask { bool retain_graph_; bool allow_unused_; bool only_inputs_; - detail::BackwardStrategy strategy_; }; PartialGradTask::PartialGradTask( @@ -585,15 +585,14 @@ PartialGradTask::PartialGradTask( const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, const 
detail::BackwardStrategy &strategy, - bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) { + const platform::Place &place, bool create_graph, bool retain_graph, + bool allow_unused, bool only_inputs) { input_targets_ = input_targets; place_ = place; create_graph_ = create_graph; retain_graph_ = retain_graph; allow_unused_ = allow_unused; only_inputs_ = only_inputs; - strategy_ = strategy; PADDLE_ENFORCE_EQ(only_inputs_, true, platform::errors::Unimplemented( @@ -981,7 +980,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) { if (!accumulator) { accumulator.reset(new GradientAccumulationInfo( - var, strategy_.sorted_sum_gradient_, create_graph_)); + var, FLAGS_sort_sum_gradient, create_graph_)); } accumulator->IncreaseTotalRefCnt(); @@ -1033,11 +1032,11 @@ PartialGradEngine::PartialGradEngine( const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, const detail::BackwardStrategy &strategy, - bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) + const platform::Place &place, bool create_graph, bool retain_graph, + bool allow_unused, bool only_inputs) : task_(new PartialGradTask(input_targets, output_targets, output_grads, - no_grad_vars, place, strategy, create_graph, - retain_graph, allow_unused, only_inputs)) {} + no_grad_vars, place, create_graph, retain_graph, + allow_unused, only_inputs)) {} PartialGradEngine::~PartialGradEngine() { Clear(); } diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h index a7f28c49ec3950674cd43127f51934089a497412..b5da39f8d4237130fd4674eacb479aaf6b9ba348 100644 --- a/paddle/fluid/imperative/partial_grad_engine.h +++ b/paddle/fluid/imperative/partial_grad_engine.h @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/platform/place.h" @@ -33,8 +32,7 @@ class PartialGradEngine : public Engine { const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, - const detail::BackwardStrategy &strategy, bool create_graph, + const platform::Place &place, bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs); ~PartialGradEngine(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 0336325bef6bd71208a839478741490ec4e10b66..4e0e95dd012976c292b4511e9707802c210dc599 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -100,6 +100,13 @@ PreparedOp PrepareOpImpl(const NameVarMap& ins, VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_XPU + if (kernel_iter == kernels.end() && + is_xpu_place(expected_kernel_key.place_)) { + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } +#endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 3c3ec2e6263396881597649d3ab643b5492d630a..892acffb712d9734e525a403881fda47ca0df23a 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ 
b/paddle/fluid/imperative/tests/test_tracer.cc @@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { framework::AttributeMap reduce_attr_map; tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); - detail::BackwardStrategy back_st; imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get(), back_st); + engine.Init(reduce_sum_out.get()); engine.Execute(); framework::LoDTensor rlt; @@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); - detail::BackwardStrategy back_st; imperative::BasicEngine engine; - engine.Init(vout.get(), back_st); + engine.Init(vout.get()); engine.Execute(); // check the grad diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 1f2734eece578f7ec266a6f31cd46b373f010fc1..98554ed04976670c1a846cbeab69815417c0a998 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32) SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_fluid_shared ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) -elseif(NOT WIN32) - # TODO: Fix this unittest failed on Windows - inference_analysis_test(test_analyzer - SRCS analyzer_tester.cc - EXTRA_DEPS reset_tensor_array paddle_inference_api - ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) +elseif(WIN32) + inference_analysis_test(test_analyzer + SRCS analyzer_tester.cc + EXTRA_DEPS reset_tensor_array paddle_inference_api + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 30e8386f4c86e308372b5dd6328c7d3785a073b1..fb0ad31a3e612201de32813a65970c73b73b611b 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -54,8 +54,7 @@ if(WITH_TESTING) ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") - elseif(NOT WIN32) - # TODO: Fix this unittest failed on Windows + elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) @@ -67,8 +66,7 @@ endif() if (NOT APPLE AND NOT WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) -elseif (NOT WIN32) - # TODO: Fix this unittest failed on Windows +elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 61886c225e6548413e6e2eb0415f596d016a988f..a1c1e6de5fd44617a30f235a0416d897bf932075 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -218,6 +218,17 @@ void AnalysisConfig::EnableMkldnnQuantizer() { Update(); } +void AnalysisConfig::EnableMkldnnBfloat16() { +#ifdef PADDLE_WITH_MKLDNN + use_mkldnn_bfloat16_ = true; +#else + LOG(ERROR) << "Please compile 
with MKLDNN first to use MkldnnBfloat16"; + use_mkldnn_bfloat16_ = false; +#endif + + Update(); +} + MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, "MkldnnQuantizer was not enabled yet."); @@ -331,6 +342,12 @@ void AnalysisConfig::Update() { #endif } + if (use_mkldnn_bfloat16_) { +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMkldnnBfloat16(); +#endif + } + #ifdef PADDLE_WITH_MKLDNN // Do not optimize when mkldnn is on if (enable_memory_optim_ && !use_mkldnn_) { @@ -399,6 +416,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << ";"; ss << use_mkldnn_quantizer_; + ss << use_mkldnn_bfloat16_; ss << model_from_memory_; ss << with_profile_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index dea448f9b03468eabda16d4375ea60348a09efb2..5766919f08e68832886b88b867bc48afa288a955 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -485,4 +485,25 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { } #endif +#ifdef PADDLE_WITH_CUDA +TEST(AnalysisPredictor, bf16_gpu_pass_strategy) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); + config.EnableUseGpu(100, 0); + config.EnableMkldnnBfloat16(); +#ifdef PADDLE_WITH_MKLDNN + ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true); +#else + ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false); +#endif +} +#endif + +TEST(AnalysisPredictor, bf16_pass_strategy) { + std::vector passes; + PassStrategy passStrategy(passes); + passStrategy.EnableMkldnnBfloat16(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6a31ff281c68e3675d35c14059a453455ef398df..b1244e4e3dfdd5e6a627054250e6def2a7c35a89 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableMkldnnQuantizer(); + /// + /// \brief Turn on MKLDNN bfloat16. + /// + /// + void EnableMkldnnBfloat16(); + + /// + /// \brief A boolean state telling whether to use the MKLDNN Bfloat16. + /// + /// \return bool Whether to use the MKLDNN Bfloat16. + /// + bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; } + /// /// \brief A boolean state telling whether the thread local CUDA stream is /// enabled. @@ -592,6 +605,7 @@ struct PD_INFER_DECL AnalysisConfig { int mkldnn_cache_capacity_{0}; bool use_mkldnn_quantizer_{false}; std::shared_ptr mkldnn_quantizer_config_; + bool use_mkldnn_bfloat16_{false}; // If the config is already used on a predictor, it becomes invalid. // Any config can only be used with one predictor. 
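Note: a minimal usage sketch for the EnableMkldnnBfloat16() switch added above (not part of this diff; ConfigureBf16 and model_dir are illustrative names, and EnableMKLDNN() is assumed to be the pre-existing MKLDNN toggle on AnalysisConfig):

    #include <string>
    #include "paddle/fluid/inference/api/paddle_analysis_config.h"

    // Sketch only: turn on the MKLDNN bfloat16 path for an inference config.
    void ConfigureBf16(const std::string& model_dir) {
      paddle::AnalysisConfig config;
      config.SetModel(model_dir);
      config.EnableMKLDNN();          // bfloat16 is only wired up when MKLDNN is available
      config.EnableMkldnnBfloat16();  // new API introduced in this change
      // Reports false when Paddle is built without PADDLE_WITH_MKLDNN.
      bool enabled = config.mkldnn_bfloat16_enabled();
      (void)enabled;
    }

As the analysis_config.cc hunk above shows, EnableMkldnnBfloat16() only enables the bfloat16 passes when PADDLE_WITH_MKLDNN is defined; otherwise it logs an error and resets the flag.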
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index c07ac11e278901e9b9475492ca38411dcf8184d3..ffb70700b5f98a51b579a68f746ea1ee6a6d9f7b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -143,6 +143,10 @@ void GpuPassStrategy::EnableMkldnnQuantizer() { LOG(ERROR) << "GPU not support MKL-DNN quantization"; } +void GpuPassStrategy::EnableMkldnnBfloat16() { + LOG(ERROR) << "GPU not support MKL-DNN bfloat16"; +} + CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // NOTE the large fusions should be located in the front, so that they will // not be damaged by smaller ones. @@ -223,4 +227,12 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { #endif } +void CpuPassStrategy::EnableMkldnnBfloat16() { +#ifdef PADDLE_WITH_MKLDNN + use_mkldnn_bfloat16_ = true; +#else + use_mkldnn_bfloat16_ = false; +#endif +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index c5a4a5f754d031a8e8f88a96dd16c89fbe1b0fbb..9073253520466a3711089bc7b7da04a9191e0a42 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -132,6 +132,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable MKLDNN quantize optimization. virtual void EnableMkldnnQuantizer() {} + /// \brief Enable MKLDNN bfloat16. + virtual void EnableMkldnnBfloat16() {} + /// \brief Check if we are using gpu. /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } @@ -161,6 +164,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { use_gpu_ = other.use_gpu_; use_mkldnn_ = other.use_mkldnn_; use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; + use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_; } /// \brief Default destructor. virtual ~CpuPassStrategy() = default; @@ -174,9 +178,13 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { /// \brief Enable MKLDNN quantize optimization. void EnableMkldnnQuantizer() override; + /// \brief Enable MKLDNN bfloat16. + void EnableMkldnnBfloat16() override; + protected: /// \cond Protected bool use_mkldnn_quantizer_{false}; + bool use_mkldnn_bfloat16_{false}; /// \endcond }; @@ -205,6 +213,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Not supported in GPU mode yet. void EnableMkldnnQuantizer() override; + /// \brief Not supported in GPU mode yet. + void EnableMkldnnBfloat16() override; + /// \brief Default destructor. 
virtual ~GpuPassStrategy() = default; diff --git a/paddle/fluid/inference/capi/paddle_c_api.h b/paddle/fluid/inference/capi/paddle_c_api.h index 4be6b48fb1820dc3271de164e87387c73ee67da9..32129890d02a2a0e0b357a6e0402d07b56bc6509 100644 --- a/paddle/fluid/inference/capi/paddle_c_api.h +++ b/paddle/fluid/inference/capi/paddle_c_api.h @@ -235,6 +235,12 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer( PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled( const PD_AnalysisConfig* config); +PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16( + PD_AnalysisConfig* config); + +PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled( + const PD_AnalysisConfig* config); + PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, size_t prog_buffer_size, diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index f5445dd5a3f9b6499045361a36fd6363a79ef560..b99abc06b27ecb9686b4c6e883aaaf8b3e592415 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -207,6 +207,18 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) { return config->config.mkldnn_quantizer_enabled(); } +void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( + "PD_AnalysisConfig should not be null")); + config->config.EnableMkldnnBfloat16(); +} + +bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( + "PD_AnalysisConfig should not be null")); + return config->config.mkldnn_bfloat16_enabled(); +} + void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, size_t prog_buffer_size, const char* params_buffer, size_t params_buffer_size) { diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 97d09925b19c4911a6b412518dc58fe88da16f64..10c212c0b4fa394e3c745bf524ef9d081c4bc3c1 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -51,7 +51,13 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - CHECK(op_desc.HasAttr("Input_scale")); + if (op_desc.Type() != "conv2d_transpose") { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("Input_scale"), true, + platform::errors::InvalidArgument("Input scale not found. 
TRT int8" + " requires conv/deconv to have " + "input quantization scales.")); + } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 70ead9720d2ebcb15ae0173dc0ba7c2095a4f4d4..f5d22b982de2b41474d97565a44dedc67c8a85d7 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller { #if IS_TRT_VERSION_GE(5130) teller_set.insert("relu6"); teller_set.insert("hard_sigmoid"); + int8_teller_set.insert("relu6"); + int8_teller_set.insert("hard_sigmoid"); #endif #if IS_TRT_VERSION_GE(6000) teller_set.insert("fused_embedding_eltwise_layernorm"); @@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_add", "leaky_relu", "fc", - "relu6", "concat", "scale", "elementwise_mul", - "conv2d_transpose"}; + "conv2d_transpose", + "hard_swish"}; std::unordered_set teller_set{ "mul", "conv2d", diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9a3a73f6c946d76f0da1b18feb8e7d61c0bf59b6..814deda6729278e2e9f9e76ff83bbdd4966821c1 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -132,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML) set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -192,6 +193,8 @@ inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) +set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") + # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc index c60e0a25f28c01c453276a8ef04eb79b35b7dda2..da0c93d21b7852e06b6805230078540063c2b243 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc @@ -54,6 +54,9 @@ TEST(PD_AnalysisConfig, use_gpu) { PD_SwitchIrOptim(config, true); bool ir_optim = PD_IrOptim(config); CHECK(ir_optim) << "NO"; + PD_EnableMkldnnBfloat16(config); + bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config); + CHECK(!bfloat16_enable) << "NO"; PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false, false); bool trt_enable = PD_TensorrtEngineEnabled(config); diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc index 
93fcb43447d01dcafa10d8c85234d243d5095d4e..e24706691ed834ac4f49d924162035ec565d24ea 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc @@ -88,6 +88,9 @@ TEST(PD_AnalysisConfig, profile_mkldnn) { PD_EnableMkldnnQuantizer(config); bool quantizer_enable = PD_MkldnnQuantizerEnabled(config); CHECK(quantizer_enable) << "NO"; + PD_EnableMkldnnBfloat16(config); + bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config); + CHECK(bfloat16_enable) << "NO"; PD_SetMkldnnCacheCapacity(config, 0); PD_SetModel(config, prog_file.c_str(), params_file.c_str()); PD_DeleteAnalysisConfig(config); diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 72816f6d0317600ad6bf8ffc4ad31bd1a23d7c30..b9f979f96d4b106642795151fb8e34b025b2caef 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -32,19 +32,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && - ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} + URL ${URL}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_EXTRACT 1 DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" - BUILD_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} + ${CMAKE_COMMAND} -E tar xzf ${FILENAME} UPDATE_COMMAND "" INSTALL_COMMAND "" ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}) inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index bd1908ac65509343530aa57489661637eed72595..9cc7c267454a4dbd4e1f62ec971e4160d6088913 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) if (WITH_GPU) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) +elseif(WITH_XPU) + set(AllocatorFacadeDeps xpu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 2ab0d69ef806155adbff83e523a1242e51c2c7fc..3213684c140b02e1fa4b846cb0448f9bc9d8f3ee 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -39,6 +39,9 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_info.h" +#endif DEFINE_int64( gpu_allocator_retry_time, 10000, @@ -62,6 +65,11 @@ class AllocatorFacadePrivate { switch (strategy) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_CUDA for (int dev_id = 0; dev_id < 
platform::GetCUDADeviceCount(); ++dev_id) { @@ -74,6 +82,11 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_CUDA for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { @@ -86,6 +99,11 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kThreadLocal: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_CUDA for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { @@ -127,6 +145,13 @@ class AllocatorFacadePrivate { private: void InitSystemAllocators() { system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif #ifdef PADDLE_WITH_CUDA system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); @@ -164,6 +189,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_XPU + void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + class ZeroSizeAllocator : public Allocator { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} @@ -191,6 +222,12 @@ class AllocatorFacadePrivate { } places.emplace_back(platform::CUDAPinnedPlace()); #endif +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::XPUPlace(dev_id)); + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 907a266e7b2bcd30e65ca71ab3dbae7f9b110b3b..92e3933a072832fa42520e67f455d3dc90118518 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -29,6 +29,9 @@ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#endif DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -101,6 +104,100 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +template <> +void *Alloc(const platform::XPUPlace &place, size_t size) { +#ifdef PADDLE_WITH_XPU + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void *p = nullptr; + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + ret = xpu_set_device(place.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_malloc(reinterpret_cast(&p), 
size); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (FLAGS_init_allocated_mem) { + PADDLE_THROW(platform::errors::Unimplemented( + "xpu memory FLAGS_init_allocated_mem is not implemented.")); + } + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + VLOG(10) << " pointer=" << p; + return p; +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); + return nullptr; +#endif +} + +template <> +void Free(const platform::XPUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_XPU + VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + ret = xpu_set_device(place.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + xpu_free(p); + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + +template <> +size_t Used(const platform::XPUPlace &place) { +#ifdef PADDLE_WITH_XPU + printf("Used func returns 0 for XPUPlace\n"); + return 0; +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + #ifdef PADDLE_WITH_CUDA class GPUBuddyAllocatorList { private: diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index b19f02db1c0ddf17c84536bf5d512bbd823909b2..225b6858cc1f2a5afc9d612958694d0d940e2e7b 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -18,6 +18,10 @@ limitations under the License.
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#endif + namespace paddle { namespace memory { @@ -29,6 +33,169 @@ void Copy(platform::CPUPlace, void* dst, std::memcpy(dst, src, num); } +#ifdef PADDLE_WITH_XPU +template <> +void Copy(platform::XPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num) { + if (num <= 0) { + VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; + return; + } + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + if (dev_id != dst_place.device) { + ret = xpu_set_device(dst_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id != dst_place.device) { + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } +} + +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::XPUPlace src_place, + const void* src, size_t num) { + if (num <= 0) { + VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; + return; + } + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + if (dev_id != src_place.device) { + ret = xpu_set_device(src_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id != src_place.device) { + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } +} + +template <> +void Copy(platform::XPUPlace dst_place, + void* dst, + platform::XPUPlace src_place, + const void* src, size_t num) { + if (num <= 0) { + VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; + return; + } + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if 
(dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + if (dev_id != src_place.device || dev_id != dst_place.device) { + ret = xpu_set_device(src_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + void* tmp = malloc(num); + ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_set_device(dst_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + free(tmp); + } else { + int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } +} +#endif + #ifdef PADDLE_WITH_CUDA static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 48d1ec9461a880c3c888caec36dd69808d070e8b..6e8ff52ed4a8846f5f6060e10cfd9bec22308e9e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -123,7 +123,7 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) -nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) +nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator) if (WITH_GPU) nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3) else() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 3d604fd335ded19d9408f50942a2fd9fe2fbda01..63b3b0f1a3408154a2d1c8aff76a85a95ad044f6 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -317,13 +317,6 @@ $$out = x^2$$ )DOC"; -UNUSED constexpr char SoftplusDoc[] = R"DOC( -Softplus Activation Operator. - -$$out = \ln(1 + e^{x})$$ - -)DOC"; - UNUSED constexpr char SoftsignDoc[] = R"DOC( Softsign Activation Operator. 
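For reference (this sketch is not part of the patch): the hunk that follows replaces the removed SoftplusDoc with a SoftplusOpMaker computing out = (1/beta) * log(1 + exp(beta * x)) and reverting to x once beta * x exceeds threshold (defaults: beta = 1, threshold = 20). A minimal host-side C++ sketch of that thresholded formula; softplus_ref and the main driver are illustrative names only, not the Eigen-based functor the patch actually adds.

#include <cmath>
#include <cstdio>

// Host-side sketch of the thresholded softplus added in the next hunk.
// beta and threshold mirror the operator's defaults (1.0f and 20.0f).
static float softplus_ref(float x, float beta = 1.0f, float threshold = 20.0f) {
  const float x_beta = beta * x;
  // Revert to the linear function for large inputs so exp() cannot overflow.
  if (x_beta > threshold) return x;
  return std::log1p(std::exp(x_beta)) / beta;
}

int main() {
  // softplus_ref(0) == log(2) ~= 0.693147; large inputs pass through unchanged.
  std::printf("%f %f\n", softplus_ref(0.0f), softplus_ref(50.0f));
  return 0;
}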
@@ -396,6 +389,36 @@ $$out = \max(x, \alpha * x)$$ } }; +class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "Input of Softplus operator, an N-D Tensor, with data type " + "float32, float64 or float16."); + AddOutput( + "Out", + "Output of Softplus operator, a Tensor with shape same as input."); + AddAttr("beta", "The value of beta for Softplus.").SetDefault(1.0f); + AddAttr("threshold", "The value of threshold for Softplus.") + .SetDefault(20.0f); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel.") + .SetDefault(false); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn.") + .SetDefault(false); + AddComment(R"DOC( +:strong:`Softplus Activation Operator` + +.. math:: + out = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) \\ + \text{For numerical stability, the implementation reverts to the linear function when :}\,x \times \beta > threshold. + +)DOC"); + } +}; + class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -672,7 +695,6 @@ REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc); REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); -REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); template @@ -759,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { } }; -// leaky_relu Grad: dx=dy if y>=0 else alpha * dy -// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx +// leaky_relu Grad: dx=dy if x>=0 else alpha * dy +// leaky_relu GradGrad: ddy=ddx if x>=0 else alpha * ddx template class LeakyReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { @@ -770,8 +792,8 @@ class LeakyReluDoubleGradMaker protected: void Apply(GradOpPtr op) const override { op->SetType("leaky_relu_grad_grad"); - // input1: Out - op->SetInput("Out", this->Input("Out")); + // input1: X + op->SetInput("X", this->Input("X")); // X@GRAD@GRAD: ddx op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 3aac7ae8a5e8a9e889242b59f42a29af08ad1c46..00a7c063c9155488d117332d5ef3541d16d76bdb 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -388,9 +388,9 @@ struct HardShrinkFunctor : public BaseActivationFunctor { } template void operator()(Device d, X x, Out out) const { - auto temp1 = (x < static_cast(threshold * -1)).template cast(); - auto temp2 = (x > static_cast(threshold)).template cast(); - out.device(d) = x * (temp1 + temp2); + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + out.device(d) = x * (temp1 + temp2 > 0).template cast(); } }; @@ -405,9 +405,9 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = (x < static_cast(threshold * -1)).template cast(); - auto temp2 = (x > static_cast(threshold)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + dx.device(d) = dout * (temp1 + temp2 > 0).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -975,32 +975,46 @@ struct 
HardSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; -// softplus(x) = log(1 + exp(x)) -// When x is a very large positive number, exp(x) may explode to inf, -// Using trick below for numerical stability -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0))) +// For numerical stability, using the following formula instead of softplus(x) = +// log(1 + exp(x)) +// softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta = +// 1, threshold = 20 by default), otherwise x template struct SoftplusFunctor : public BaseActivationFunctor { + float beta; + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + template void operator()(Device d, X x, Out out) { - auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) - out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); + auto x_beta = static_cast(beta) * x; + out.device(d) = (x_beta > static_cast(threshold)) + .select(x, (static_cast(1) + x_beta.exp()).log() / + static_cast(beta)); } }; -// d(softplus(x))/dx = exp(x) / (1 + exp(x)) -// For numerical stability: -// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + -// exp(x - max(x, 0))) +// For numerical stability, using the following formula instead of +// d(softplus(x))/dx = 1 / (1 + exp(-x)) +// d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta +// = 1, threshold = 20 by default), otherwise x template struct SoftplusGradFunctor : public BaseActivationFunctor { + float beta; + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + template void operator()(Device d, X x, Out out, dOut dout, dX dx) { - auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + auto x_beta = static_cast(beta) * x; dx.device(d) = - dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); + (x_beta > static_cast(threshold)) + .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1070,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); + if (alpha < 1.f) { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } else { + out.device(d) = x.cwiseMin(static_cast(alpha) * x); + } } }; @@ -1084,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = - static_cast(alpha) * (out <= static_cast(0)).template cast(); - auto temp2 = (out > static_cast(0)).template cast(); + static_cast(alpha) * (x < static_cast(0)).template cast(); + auto temp2 = (x >= static_cast(0)).template cast(); dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -1116,9 +1134,20 @@ struct ELUGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (x > static_cast(0)).template cast() + - dout * static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast(); + auto temp_a_pos = static_cast(alpha > 0); + auto 
temp_a_neg = static_cast(alpha <= 0); + auto temp_x_pos = (x > static_cast(0)).template cast(); + auto temp_x_neg = (x <= static_cast(0)).template cast(); + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0 + // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0 + // dx = 0, if alpha <= 0 and x <=0 + dx.device(d) = + dout * temp_a_pos * temp_x_pos + + dout * static_cast(alpha) * x.exp() * temp_a_pos * temp_x_neg + + dout * (static_cast(1) + static_cast(alpha) * x.exp()) * + temp_a_neg * temp_x_pos; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1437,18 +1466,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { auto* d = dev.eigen_device(); auto ddx = framework::EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "LeakyReluGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); auto ddout = framework::EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); - ddout.device(*d) = ddx * - ((out > static_cast(0)).template cast() + - static_cast(alpha) * - (out <= static_cast(0)).template cast()) - .template cast(); + ddout.device(*d) = + ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * (x <= static_cast(0)).template cast()) + .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index f7cc513b234e6e440507af28189ac236b71f9d15..d1a3695015abdb9ce13c4f807d1abacdf0af024d 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -28,10 +28,15 @@ using Tensor = framework::Tensor; template struct Linspace { - void operator()(T start, T end, int count, framework::Tensor* numbers, + void operator()(T start, T end, int count, bool align_corners, + framework::Tensor* numbers, const framework::ExecutionContext& ctx) { T* number_data = numbers->mutable_data({count}, platform::CPUPlace()); T slice = (end - start) / (T)(count - 1); + if (!align_corners) { + slice = (end - start) / (T)count; + start *= (T)(count - 1) / (T)count; + } for (int i = 0; i < count; ++i) { number_data[i] = start + (T)i * slice; } @@ -130,6 +135,10 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(true); + AddAttr("align_corners", + "(bool, default false) Whether to align the corners of input" + "and ouput.") + .SetDefault(true); AddAttr>( "output_shape", "The target output image shape with format [N, C, H, W].") @@ -164,10 +173,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { [-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ]]] - C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. + C[0] is the coordinates in height axis and C[1] is the coordinates in + width axis. Step2: - Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: + Tanspose and reshape C to shape [H * W, 2] and append ones to last + dimension. The we get: C_ = [[-1. -1. 1. ] [-0.5 -1. 1. ] [ 0. -1. 1. 
] diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7aaaa0002c5ab31af72c75e69f5a283c09633ba4 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -0,0 +1,211 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/affine_grid_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { + CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +} + +template +struct Linspace { + void operator()(T start, T end, int count, bool align_corners, + framework::Tensor* numbers, + const framework::ExecutionContext& ctx) { + T* number_data = numbers->mutable_data({count}, ctx.GetPlace()); + T slice = (end - start) / (T)(count - 1); + if (!align_corners) { + slice = (end - start) / (T)count; + start *= (T)(count - 1) / (T)count; + } + auto stream = ctx.cuda_device_context().stream(); + int block = 512; + int grid = (count + block - 1) / block; + LinspaceKernel<<>>(start, slice, count, + number_data); + } +}; + +template +__global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w, + T h_start, T w_start, T h_step, T w_step, + const T* theta, // N, 2, 3 + T* output) { + CUDA_KERNEL_LOOP(index, count) { + int w = index % out_w; + int h = (index / out_w) % out_h; + int n = index / (out_w * out_h); + + T h_coor = h_step * static_cast(h) + static_cast(h_start); + T w_coor = w_step * static_cast(w) + static_cast(w_start); + + int theta_offset = n * 6; // 2 * 3; + // affine from (h_coor, w_coor) to (x, y) + output[index * 2] = theta[theta_offset] * h_coor + + theta[theta_offset + 1] * w_coor + + theta[theta_offset + 2]; + output[index * 2 + 1] = theta[theta_offset + 3] * h_coor + + theta[theta_offset + 4] * w_coor + + theta[theta_offset + 5]; + } +} + +template +__global__ void affine_grid_grad_kernel(const int count, int n, int out_h, + int out_w, T h_start, T w_start, + T h_step, T w_step, + const T* out_grad, // N, H, W, 2 + T* theta_grad) { // N, 2, 3 + CUDA_KERNEL_LOOP(index, count) { + int w = index % out_w; + int h = (index / out_w) % out_h; + int n = index / (out_w * out_h); + T h_coor = h_step * static_cast(h) + static_cast(h_start); + T w_coor = w_step * static_cast(w) + static_cast(w_start); + + int theta_offset = n * 6; // 2 * 3; + T out_grad_x = out_grad[index * 2]; + platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); + + T out_grad_y = out_grad[index * 2 + 1]; + 
platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); + } +} + +template +class AffineGridOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* theta = ctx.Input("Theta"); + int n = theta->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + auto align_corners = ctx.Attr("align_corners"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + auto* output = ctx.Output("Output"); + T* out_data = output->mutable_data({n, h, w, 2}, ctx.GetPlace()); + + T h_step; + T w_step; + T h_start = -1; + T w_start = -1; + if (align_corners) { + h_step = static_cast(2) / static_cast(h - 1); + w_step = static_cast(2) / static_cast(w - 1); + } else { + h_step = static_cast(2) / static_cast(h); + w_step = static_cast(2) / static_cast(w); + + h_start *= static_cast(h - 1) / static_cast(h); + w_start *= static_cast(w - 1) / static_cast(w); + } + + const int count = n * h * w; + int block = 512; + int grid = (count + block - 1) / block; + auto cu_stream = ctx.cuda_device_context().stream(); + affine_grid_kernel<<>>( + count, n, h, w, h_start, w_start, h_step, w_step, + theta->data(), // N, 2, 3 + out_data); + } +}; + +template +class AffineGridGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + auto align_corners = ctx.Attr("align_corners"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + T* theta_grad_data = theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); + math::SetConstant()( + ctx.cuda_device_context(), theta_grad, static_cast(0)); + + T h_step; + T w_step; + T h_start = -1; + T w_start = -1; + if (align_corners) { + h_step = static_cast(2) / static_cast(h - 1); + w_step = static_cast(2) / static_cast(w - 1); + } else { + h_step = static_cast(2) / static_cast(h); + w_step = static_cast(2) / static_cast(w); + + h_start *= static_cast(h - 1) / static_cast(h); + w_start *= static_cast(w - 1) / static_cast(w); + } + const int count = n * h * w; + VLOG(3) << "count: " << count << "; h_step: " << h_step + << "; w_step: " << w_step << "; h_start: " << h_start + << "; w_start: " << w_start; + int block = 512; + int grid = (count + block - 1) / block; + auto cu_stream = ctx.cuda_device_context().stream(); + affine_grid_grad_kernel<<>>( + count, n, h, w, h_start, w_start, h_step, w_step, + output_grad->data(), theta_grad_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(affine_grid, ops::AffineGridOpCUDAKernel, + 
ops::AffineGridOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(affine_grid_grad, + ops::AffineGridGradOpCUDAKernel, + ops::AffineGridGradOpCUDAKernel); diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h index 73df8a38b96c30196a7e39d2cf1e348f2a7722ec..50c9ebcd9c8f52077d7f5d0abb10c631cbeee794 100644 --- a/paddle/fluid/operators/affine_grid_op.h +++ b/paddle/fluid/operators/affine_grid_op.h @@ -37,12 +37,13 @@ using Array4 = Eigen::DSizes; */ template struct Linspace { - void operator()(T start, T end, int count, framework::Tensor* numbers, + void operator()(T start, T end, int count, bool align_corners, + framework::Tensor* numbers, const framework::ExecutionContext& ctx); }; template -inline void GetIdxMap(int n, int h, int w, Tensor* grid, +inline void GetIdxMap(int n, int h, int w, bool align_corners, Tensor* grid, const framework::ExecutionContext& ctx) { auto& place = *ctx.template device_context().eigen_device(); grid->mutable_data({n, h, w, 3}, ctx.GetPlace()); @@ -50,16 +51,19 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid, // Get indexes of height with shape [height, width, 1] Tensor h_idx; Linspace linspace; - linspace((T)-1, (T)1, h, &h_idx, ctx); + linspace((T)-1, (T)1, h, align_corners, &h_idx, ctx); auto h_idx_t = EigenTensor::From(h_idx); // Get indexes of width with shape [height, width, 1] Tensor w_idx; - linspace((T)-1, (T)1, w, &w_idx, ctx); + linspace((T)-1, (T)1, w, align_corners, &w_idx, ctx); auto w_idx_t = EigenTensor::From(w_idx); // Get constant ones tensor with shape [height, width, 1] Tensor ones; ones.mutable_data({h, w, 1}, ctx.GetPlace()); - auto ones_t = EigenTensor::From(ones).setConstant((T)1); + + math::SetConstant()( + ctx.template device_context(), &ones, static_cast(1)); + auto ones_t = EigenTensor::From(ones); // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and // ones Tensor w_idx_map; @@ -74,11 +78,9 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid, Tensor w_h_one_idx_map; w_h_one_idx_map.mutable_data({h, w, 3}, ctx.GetPlace()); auto w_h_one_idx_map_t = EigenTensor::From(w_h_one_idx_map); - w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w)) .broadcast(Array2(h, 1)) .reshape(Array3(h, w, 1)); - h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h)) .broadcast(Array2(w, 1)) .shuffle(Array2(1, 0)) @@ -97,6 +99,7 @@ class AffineGridOpKernel : public framework::OpKernel { auto* theta = ctx.Input("Theta"); int n = theta->dims()[0]; auto size_attr = ctx.Attr>("output_shape"); + auto align_corners = ctx.Attr("align_corners"); int h = 0; int w = 0; if (size_attr.size() == 0) { @@ -116,7 +119,7 @@ class AffineGridOpKernel : public framework::OpKernel { ctx.template device_context(), output, static_cast(0)); Tensor grid; - GetIdxMap(n, h, w, &grid, ctx); + GetIdxMap(n, h, w, align_corners, &grid, ctx); // output = grid * theta.T // TODO(wanghaoshuang): Refine batched matrix multiply auto blas = math::GetBlas(ctx); @@ -140,6 +143,7 @@ class AffineGridGradOpKernel : public framework::OpKernel { auto theta_grad = ctx.Output(framework::GradVarName("Theta")); int n = output_grad->dims()[0]; auto size_attr = ctx.Attr>("output_shape"); + auto align_corners = ctx.Attr("align_corners"); int h = 0; int w = 0; if (size_attr.size() == 0) { @@ -158,7 +162,7 @@ class AffineGridGradOpKernel : public framework::OpKernel { ctx.template device_context(), theta_grad, static_cast(0)); Tensor grid; - GetIdxMap(n, h, w, &grid, ctx); + GetIdxMap(n, h, w, align_corners, &grid, ctx); // output = 
grid * theta.T // TODO(wanghaoshuang): Refine batched matrix multiply auto blas = math::GetBlas(ctx); diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 911757007266c9ff88b0e348d350909ce0ff0bce..736483c3304ac32491de4fd98879fbfef04f7110 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -22,9 +22,11 @@ namespace operators { class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Input", "The first input tensor to compare."); - AddInput("Other", "The second input tensor to compare."); - AddOutput("Out", "The output tensor of allclose op."); + AddInput("Input", + "The input tensor, it's data type should be float32, float64."); + AddInput("Other", + "The input tensor, it's data type should be float32, float64."); + AddOutput("Out", "The output tensor, it's data type is bool."); AddAttr("rtol", "The relative tolerance. Default: :math:`1e-5` .") .SetDefault(1e-5); @@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddComment(R"DOC( -This operator checks if all :math:`input` and :math:`other` satisfy the condition: +This operator checks if all :math:`x` and :math:`y` satisfy the condition: -:math:`\left| input - other \right| \leq atol + rtol \times \left| other \right|` +.. math:: + \left| x - y \right| \leq atol + rtol \times \left| y \right| -elementwise, for all elements of :math:`input` and :math:`other`. The behaviour of this +elementwise, for all elements of :math:`x` and :math:`y`. The behaviour of this operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if two tensors are elementwise equal within a tolerance. )DOC"); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu index 85e4f98173511435a52b32e506afc8d5b772f74f..14708c4df10f5160d0e72e7669e0015554d8215f 100644 --- a/paddle/fluid/operators/arg_max_op.cu +++ b/paddle/fluid/operators/arg_max_op.cu @@ -1,29 +1,22 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.h" - -REGISTER_OP_CUDA_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" + +REGISTER_OP_CUDA_KERNEL( + arg_max, paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..73581dac4e419ca9c970db4414ff54d4cbd3fd70 --- /dev/null +++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef __NVCC__ + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +namespace { // NOLINT +template +using KeyValuePair = cub::KeyValuePair; +using Tensor = framework::Tensor; + +} // end namespace + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) 
\ + FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); + +template +__global__ void ArgCUDAKernel(const int64_t height, // n * h + const int64_t width, // c + const int64_t post_size, // h + const Reducer reducer, const T init, const T* in, + IndType* out) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { + KeyValuePair kv_pair = {-1, init}; + int h = idx / post_size; + int w = idx % post_size; + for (int k = threadIdx.x; k < width; k += blockDim.x) { + kv_pair = + reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); + } + kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); + if (threadIdx.x == 0) { + out[idx] = static_cast(kv_pair.key); + } + __syncthreads(); + } +} + +template +void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, + Tensor* indices, const int64_t pre, const int64_t post, + const int64_t n) { + auto cu_stream = ctx.stream(); + auto ComputeBlockSize = [](int64_t col) { + if (col > 512) + return 1024; + else if (col > 256) + return 512; + else if (col > 128) + return 256; + else if (col > 64) + return 128; + else if (col > 32) + return 64; + else if (col > 16) + return 32; + else if (col > 8) + return 16; + else + return 8; + }; + + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x; + int64_t height = pre * post; + int64_t width = n; + int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; + + const T* in_data = input.data(); + IndType* out_data = indices->mutable_data(ctx.GetPlace()); + + if (typeid(Reducer) == typeid(cub::ArgMax)) { + switch (ComputeBlockSize(width)) { + FIXED_BLOCK_DIM_CASE( + ArgCUDAKernel<<>>( + height, width, post, Reducer(), std::numeric_limits::lowest(), + in_data, out_data)); + } + } else { + switch (ComputeBlockSize(width)) { + FIXED_BLOCK_DIM_CASE( + ArgCUDAKernel<<>>( + height, width, post, Reducer(), std::numeric_limits::max(), + in_data, out_data)); + } + } +} + +template +struct VisitDataCudaArgMinMaxFunctor { + const framework::ExecutionContext& ctx; + + explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) + : ctx(ctx) {} + template + void apply() const { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int axis = ctx.Attr("axis"); + const bool& flatten = ctx.Attr("flatten"); + + framework::DDim input_dims; + if (flatten) { + input_dims = framework::make_ddim({input->numel()}); + // if flatten, the axis just as 0 + axis = 0; + } else { + input_dims = input->dims(); + if (axis < 0) axis += input->dims().size(); + } + + int64_t numel = input->numel(); + int64_t groups = numel / input_dims[axis]; + int64_t pre = 1; + int64_t post = 1; + int64_t n = input_dims[axis]; + + for (int i = 0; i < axis; i++) { + pre *= input_dims[i]; + } + + for (int i = axis + 1; i < input_dims.size(); i++) { + post *= input_dims[i]; + } + + const auto& dev_ctx = ctx.cuda_device_context(); + ComputeFullArg(dev_ctx, *input, output, pre, post, n); + } +}; +template +class ArgMinMaxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dtype = ctx.Attr("dtype"); + if (dtype < 0) { + framework::VisitDataType(static_cast( + framework::proto::VarType::INT64), + VisitDataCudaArgMinMaxFunctor(ctx)); + return; + } + framework::VisitDataType( + static_cast(dtype), + VisitDataCudaArgMinMaxFunctor(ctx)); + } +}; + +#endif + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index 0fc7b47c62ea9d7da805b797fcf5e4db4e39328d..ae3637f6f99783d70bd57a3935a979b0387692de 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {}; struct ArgMinMaxFunctor { \ void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in); \ + framework::LoDTensor* out, framework::DDim x_dims, \ + int64_t axis, bool keepdims) { \ + auto in_eigen = framework::EigenTensor::From(in, x_dims); \ if (keepdims) { \ auto out_eigen = framework::EigenTensor::From(*out); \ out_eigen.device(*(ctx.eigen_device())) = \ @@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor { out.template mutable_data(ctx.GetPlace()); auto axis = ctx.Attr("axis"); auto keepdims = ctx.Attr("keepdims"); - auto x_rank = x.dims().size(); - if (axis < 0) axis += x_rank; + const bool& flatten = ctx.Attr("flatten"); + + // if flatten, will construct the new dims for the cacluate + framework::DDim x_dims; + if (flatten) { + x_dims = framework::make_ddim({x.numel()}); + // if flatten, the axis just as 0 + axis = 0; + } else { + x_dims = x.dims(); + if (axis < 0) axis += x_dims.size(); + } auto& dev_ctx = ctx.template device_context(); #define CALL_ARG_MINMAX_FUNCTOR(rank) \ 
ArgMinMaxFunctor \ functor##rank; \ - functor##rank(dev_ctx, x, &out, axis, keepdims) + functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - switch (x.dims().size()) { + switch (x_dims.size()) { case 1: CALL_ARG_MINMAX_FUNCTOR(1); break; @@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { const auto& x_dims = ctx->GetInputDim("X"); int64_t axis = ctx->Attrs().Get("axis"); bool keepdims = ctx->Attrs().Get("keepdims"); + const bool& flatten = ctx->Attrs().Get("flatten"); PADDLE_ENFORCE_GE(axis, -x_dims.size(), platform::errors::InvalidArgument( @@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size())); - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; std::vector vec; - for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]); - if (keepdims) { - vec.push_back(static_cast(1)); + if (flatten) { + // if is flatten, will return the only on element + if (keepdims) { + vec.emplace_back(static_cast(1)); + } + } else { + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); + if (keepdims) { + vec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); } - for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); ctx->SetOutputDim("Out", framework::make_ddim(vec)); } }; @@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("axis", "The axis in which to compute the arg indics."); AddAttr("keepdims", "Keep the dim that to reduce.").SetDefault(false); AddAttr("dtype", "Keep the dim that to reduce.").SetDefault(-1); + AddAttr("flatten", + "Flatten the input value, and search the min or max indices") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( %s Operator. diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu index 47d7c8b12243c6c5c501188af7f48f125c266009..23170bf0087906d752767051ce58874cb3584ee5 100644 --- a/paddle/fluid/operators/arg_min_op.cu +++ b/paddle/fluid/operators/arg_min_op.cu @@ -1,29 +1,21 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.h" - -REGISTER_OP_CUDA_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" +REGISTER_OP_CUDA_KERNEL( + arg_min, paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel, + paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 50797a100b1a67244b7c7b40b47404b60dc6af65..f56789b889526301e670ac37d1d6131aaafb070a 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -32,22 +32,29 @@ class BCELossOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss"); auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ( - x_dims.size(), label_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape.")); - bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) || - framework::contain_unknown_dim(label_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; + auto labels_dims = ctx->GetInputDim("Label"); + + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, labels_dims.size(), + platform::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." + "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, labels_dims.size())); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(labels_dims) <= 0)) { + check = false; + } + if (check) { - PADDLE_ENFORCE_EQ( - x_dims.size(), label_dims.size(), - platform::errors::InvalidArgument( - "ShapeError: Input(X) and Input(Label) shall have the same shape " - "But received: the shape of Input(X) is [%s], the shape of " - "Input(Label) is [%s].", - x_dims, label_dims)); + PADDLE_ENFORCE_EQ(x_dims, labels_dims, + platform::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same " + "shape. But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + x_dims, labels_dims)); } ctx->ShareDim("X", "Out"); @@ -76,20 +83,31 @@ class BCELossGradOp : public framework::OperatorWithKernel { framework::GradVarName("X"), "BCELossGrad"); auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); - bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) || - framework::contain_unknown_dim(dout_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(labels_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ(x_dims, labels_dims, + platform::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same " + "shape. 
But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + x_dims, labels_dims)); + PADDLE_ENFORCE_EQ(x_dims, dout_dims, platform::errors::InvalidArgument( - "ShapeError:The Input(X) and Input(Out@Grad) " - "should have the same " - "shape, But received: the shape of Input(X) is " - "[%s], the shape of " - "Input(Out@GRAD) is [%s].", + "Input(X) and Input(Out@Grad) shall have the same " + "shape. But received: the shape of Input(X) is " + "[%s], the shape of Input(Out@Grad) is [%s].", x_dims, dout_dims)); } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->ShareLoD("X", framework::GradVarName("X")); } diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 8e30f4eb15b6afde885512206c7eaeb721cdd44b..16db4f05e31d365d8d06174ab708e30474b8a8c2 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -67,7 +67,8 @@ class BCELossCUDAKernel : public framework::OpKernel { auto x_data = x->data(); auto out_data = out->mutable_data(ctx.GetPlace()); - int x_numel = x->numel(); + auto x_numel = x->numel(); + platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(x_numel, ctx); @@ -75,7 +76,7 @@ class BCELossCUDAKernel : public framework::OpKernel { framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); T* x_cpu_data = x_cpu.data(); - for (int i = 0; i < x_numel; ++i) { + for (int64_t i = 0; i < x_numel; ++i) { PADDLE_ENFORCE_GE( x_cpu_data[i], static_cast(0), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/bce_loss_op.h b/paddle/fluid/operators/bce_loss_op.h index 85e120e4642a298ebff00fc0e4b6425f775443aa..dd87b69efe2869727f2db778cec44612efbcff6b 100644 --- a/paddle/fluid/operators/bce_loss_op.h +++ b/paddle/fluid/operators/bce_loss_op.h @@ -34,11 +34,11 @@ class BCELossOpKernel : public framework::OpKernel { auto x_data = x->data(); auto label_data = labels->data(); auto out_data = out->mutable_data(ctx.GetPlace()); - int x_numel = x->numel(); + auto x_numel = x->numel(); // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - // x) - label * ln(x) - for (int i = 0; i < x_numel; ++i) { + for (int64_t i = 0; i < x_numel; ++i) { PADDLE_ENFORCE_GE( x_data[i], static_cast(0), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/bernoulli_op.cc b/paddle/fluid/operators/bernoulli_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..79c4e2c2bba3191535f53e2ef2a32cd66e36230c --- /dev/null +++ b/paddle/fluid/operators/bernoulli_op.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/bernoulli_op.h" + +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" + +namespace paddle { +namespace operators { + +class BernoulliOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A tensor with probabilities for generating the random binary " + "number"); + AddOutput("Out", "A Tensor filled with random binary number"); + AddComment(R"DOC( +This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution. + + Out ~ Bernoulli(X) + +)DOC"); + } +}; + +class BernoulliOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + return UnaryOpUnchangedInferShape(ctx); + } +}; + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. +template +class BernoulliOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto *in_data = x->data(); + auto *out_data = out->mutable_data(ctx.GetPlace()); + + int64_t size = x->numel(); + std::uniform_real_distribution dist(0.0, 1.0); + auto gen_ptr = framework::DefaultCPUGenerator(); + auto engine = gen_ptr->GetCPUEngine(); + + for (int64_t i = 0; i < size; ++i) { + out_data[i] = BernoulliFunctor(in_data[i], dist(*engine)); + } + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OPERATOR( + bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(bernoulli, + ops::BernoulliOpKernel, + ops::BernoulliOpKernel); diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0837071d456068f64ebc74b115f1a7904eba41c --- /dev/null +++ b/paddle/fluid/operators/bernoulli_op.cu @@ -0,0 +1,72 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/bernoulli_op.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { +// it can be consistent with cpu when CUDAGenerator is provided. 
+template +struct BernoulliCudaFunctor { + unsigned int seed_; + __host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n, const T p) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(0.0, 1.0); + rng.discard(n); + return static_cast(dist(rng) < p); + } +}; + +template +class BernoulliOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::random_device rd; + auto seed = rd(); + const auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto* in_data = x->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); + + int64_t size = x->numel(); + thrust::counting_iterator index_sequence_begin(0); + platform::Transform trans; + auto* context = + static_cast(&ctx.device_context()); + trans(*context, index_sequence_begin, index_sequence_begin + size, in_data, + out_data, BernoulliCudaFunctor(seed)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + bernoulli, ops::BernoulliOpKernel, + ops::BernoulliOpKernel); diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h new file mode 100644 index 0000000000000000000000000000000000000000..06a83ada17bb926d6f7d4eef10750986d00f048c --- /dev/null +++ b/paddle/fluid/operators/bernoulli_op.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +/** + * Samples a bernoulli distribution given a probability input + */ + +template +inline HOSTDEVICE T BernoulliFunctor(T p, T rand) { + PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange( + "The probability should be <= 1, but got %f", p)); + PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange( + "The probability should be >= 1, but got %f", p)); + return static_cast(rand < p); +} + +template +class BernoulliOpKernel; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu index c44299686516e968692fe146a5c324c7f1fa83d2..530147609fe1e47320a1cbd9223ccdfb82ba7e7a 100644 --- a/paddle/fluid/operators/cholesky_op.cu +++ b/paddle/fluid/operators/cholesky_op.cu @@ -63,7 +63,6 @@ class CholeskyGPUKernel : public framework::OpKernel { for_range(matrix_band_part_functor); } - // TODO(guosheng): Add callback to check info auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); auto* info_ptr = reinterpret_cast(info->ptr()); @@ -96,6 +95,20 @@ class CholeskyGPUKernel : public framework::OpKernel { #if CUDA_VERSION >= 9020 && !defined(_WIN32) } #endif + // check the info + std::vector error_info; // only for checking positive matrix + error_info.resize(batch_count); + + memory::Copy(platform::CPUPlace(), error_info.data(), + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_ptr, sizeof(int) * batch_count, dev_ctx.stream()); + + for (int i = 0; i < batch_count; ++i) { + PADDLE_ENFORCE_EQ(error_info[i], 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: U(%d, %d) is zero, singular U.", i, + error_info[i], error_info[i])); + } } void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, diff --git a/paddle/fluid/operators/cholesky_op.h b/paddle/fluid/operators/cholesky_op.h index b0280b00ecf447d36b199e6b6765fa7928e081f0..15dd8315362ed0221c5c8b9c523af37da38dfd7e 100644 --- a/paddle/fluid/operators/cholesky_op.h +++ b/paddle/fluid/operators/cholesky_op.h @@ -59,22 +59,24 @@ class CholeskyCPUKernel : public framework::OpKernel { Eigen::Matrix, Eigen::UpLoType::Upper> llt_decomposition(input); - PADDLE_ENFORCE_EQ( - llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The input matrice " - "might not be not be positive definite.")); + PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, + platform::errors::InvalidArgument( + "Cholesky decomposition was not successful. The " + "%d-th input matrice " + "might not be not be positive definite.", + i)); output = llt_decomposition.matrixU(); } else { Eigen::LLT< Eigen::Matrix, Eigen::UpLoType::Lower> llt_decomposition(input); - PADDLE_ENFORCE_EQ( - llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The input matrice " - "might not be not be positive definite.")); + PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, + platform::errors::InvalidArgument( + "Cholesky decomposition was not successful. 
The " + "%d-th input matrice " + "might not be not be positive definite.", + i)); output = llt_decomposition.matrixL(); } } diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index a8485a148b17c1a084b9d294c998531ec3a8e071..03abfe7eb703b021dac2261dcd9c87d440b04001 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -66,7 +66,7 @@ template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto max = static_cast(context.Attr("max")); + auto max = context.Attr("max"); Tensor max_cpu; if (context.HasInput("Max")) { auto* max_t = context.Input("Max"); @@ -77,9 +77,8 @@ class ClipKernel : public framework::OpKernel { } max = max_data[0]; } - max = static_cast(max); - auto min = context.Attr("min"); + auto min = context.Attr("min"); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); @@ -90,11 +89,12 @@ class ClipKernel : public framework::OpKernel { } min = min_data[0]; } - min = static_cast(min); - PADDLE_ENFORCE_LT(min, max, platform::errors::InvalidArgument( - "max should be greater than min. " - "But received min = %f, max = %f", - min, max)); + + PADDLE_ENFORCE_LE(min, max, + platform::errors::InvalidArgument( + "max should be greater than or equal to min. " + "But received min = %f, max = %f", + min, max)); auto* x_var = context.InputVar("X"); if (x_var->IsType()) { @@ -141,7 +141,7 @@ template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto max = static_cast(context.Attr("max")); + auto max = context.Attr("max"); Tensor max_cpu; if (context.HasInput("Max")) { auto* max_t = context.Input("Max"); @@ -152,9 +152,8 @@ class ClipGradKernel : public framework::OpKernel { } max = max_data[0]; } - max = static_cast(max); - auto min = context.Attr("min"); + auto min = context.Attr("min"); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); @@ -165,7 +164,6 @@ class ClipGradKernel : public framework::OpKernel { } min = min_data[0]; } - min = static_cast(min); auto* d_out = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 3f9423ae5c26433084eb040a4c38b14feabfe89e..686b3039d4dea93042463250e17f7558a318377c 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -35,5 +35,9 @@ if(WITH_NCCL) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common) endif() +if(WITH_GLOO) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") diff --git a/paddle/fluid/operators/collective/barrier_op.cc b/paddle/fluid/operators/collective/barrier_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f154a42e2be8f825e6b4a386c0c262f31b0edda --- /dev/null +++ b/paddle/fluid/operators/collective/barrier_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/barrier_op.h"
+
+#include
+
+namespace paddle {
+namespace operators {
+
+class BarrierOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+class BarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) Input data (only used in CUDAKernel).");
+    AddOutput("Out", "(Tensor) Output data (only used in CUDAKernel).");
+    AddAttr("ring_id", "(int default 0) communication ring id.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Barrier Operator - Barrier among all participators.)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(barrier, ops::BarrierOp, ops::BarrierOpMaker);
+REGISTER_OP_CPU_KERNEL(barrier, ops::BarrierOpCPUKernel);
diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3cad7bda63046fb5c980f9dc1c902d51f51f759
--- /dev/null
+++ b/paddle/fluid/operators/collective/barrier_op.cu.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/collective/barrier_op.h" + +#include + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class BarrierOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto stream = static_cast(dev_ctx)->stream(); + ncclRedOp_t nccl_red_type = ncclSum; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); + auto comm_stream = + platform::NCCLCommContext::Instance().Get(rid, place)->stream(); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with NCCL.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(barrier, ops::BarrierOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h new file mode 100644 index 0000000000000000000000000000000000000000..60a195a43540669921e99ab1d0f6163179ba0455 --- /dev/null +++ b/paddle/fluid/operators/collective/barrier_op.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class BarrierOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_GLOO) + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + PADDLE_ENFORCE_EQ( + gloo->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "You must initialize the gloo environment first to use it.")); + gloo::BarrierOptions opts(gloo->GetContext()); + gloo::barrier(opts); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h index fe99a9e128d1892a093c090f33f065ae2a158056..ec55a14d085e5ef23074b4924b453a74e7478216 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.h +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -23,6 +23,11 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + namespace paddle { namespace operators { @@ -30,7 +35,31 @@ template class CAllGatherOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW("unimplemented cpu kernel for CAllGatherOp."); +#if defined(PADDLE_WITH_GLOO) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + framework::DDim out_dims = in->dims(); + auto place = ctx.GetPlace(); + + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + auto nranks = gloo->Size(); + out_dims[0] *= nranks; + int64_t send_numel = in->numel(); + const T* send_buff = in->data(); + T* recv_buff = out->mutable_data(out_dims, place); + + PADDLE_ENFORCE_EQ( + gloo->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "You must initialize the gloo environment first to use it.")); + gloo::AllgatherOptions opts(gloo->GetContext()); + opts.setInput(const_cast(send_buff), send_numel); + opts.setOutput(recv_buff, send_numel * nranks); + gloo::allgather(opts); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); +#endif } }; diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 096a2f6a0959768bcb99d87b0d42edf71d98f481..be518b3bf0a397d7c20f6f1f03878987a43e84dc 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -25,6 +25,11 @@ limitations under the License. 
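The gloo-based CPU allgather kernel above multiplies the first output dimension by the world size and lays each rank's contribution out contiguously in the receive buffer. A single-process sketch of that output layout only (no gloo communication involved; AllgatherLayout is an illustrative name):

#include <iostream>
#include <vector>

// Simulated allgather layout: rank r's send buffer occupies
// recv[r * send_numel, (r + 1) * send_numel).
std::vector<float> AllgatherLayout(const std::vector<std::vector<float>>& per_rank) {
  size_t send_numel = per_rank.front().size();
  std::vector<float> recv;
  recv.reserve(send_numel * per_rank.size());
  for (const auto& buf : per_rank) recv.insert(recv.end(), buf.begin(), buf.end());
  return recv;
}

int main() {
  auto recv = AllgatherLayout({{1, 2}, {3, 4}, {5, 6}});  // nranks = 3
  for (float v : recv) std::cout << v << " ";  // 1 2 3 4 5 6
}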
*/ #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + namespace paddle { namespace operators { @@ -50,7 +55,53 @@ template class CAllReduceOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW("CAllReduce op do not support CPUKernel for now."); +#if defined(PADDLE_WITH_GLOO) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + int64_t send_numel = in->numel(); + const T* send_buff = in->data(); + T* recv_buff = out->mutable_data(in->dims(), place); + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + PADDLE_ENFORCE_EQ( + gloo->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "You must initialize the gloo environment first to use it.")); + gloo::AllreduceOptions opts(gloo->GetContext()); + opts.setInput(const_cast(send_buff), send_numel); + opts.setOutput(recv_buff, send_numel); + switch (red_type) { + case kRedSum: + opts.setReduceFunction( + static_cast( + &gloo::sum)); + break; + case kRedMax: + opts.setReduceFunction( + static_cast( + &gloo::max)); + break; + case kRedMin: + opts.setReduceFunction( + static_cast( + &gloo::min)); + break; + case kRedProd: + opts.setReduceFunction( + static_cast( + &gloo::product)); + break; + default: + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "Invalid reduce type: %d.", red_type)); + } + gloo::allreduce(opts); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); +#endif } }; diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h index 4ceb0aa835fe116cdc14444dfb7ea6046f33c482..eb4acb9a369fc7bfa8e23b5544f54c955d4a87b6 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.h +++ b/paddle/fluid/operators/collective/c_broadcast_op.h @@ -22,6 +22,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + namespace paddle { namespace operators { @@ -29,7 +34,27 @@ template class CBroadcastOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW("Unimplemented cpu kernel for CBroadcastOp."); +#if defined(PADDLE_WITH_GLOO) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto root = ctx.Attr("root"); + + auto place = ctx.GetPlace(); + int64_t send_numel = in->numel(); + T* recv_buff = out->mutable_data(in->dims(), place); + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + PADDLE_ENFORCE_EQ( + gloo->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "You must initialize the gloo environment first to use it.")); + gloo::BroadcastOptions opts(gloo->GetContext()); + opts.setOutput(recv_buff, send_numel); + opts.setRoot(root); + gloo::broadcast(opts); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); +#endif } }; diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..425351877689f7e3ad8e0a46d2226f5f751a4016 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace operators { + +class CReduceMaxOpMaker : public CReduceOpMaker { + protected: + std::string GetName() const override { return "Max"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_reduce_max, ops::CReduceOp, + ops::CReduceMaxOpMaker); + +REGISTER_OP_CPU_KERNEL(c_reduce_max, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e260346b4bdd8aced0df59c72f5adb4c479e8d0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_reduce_max, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e849641e639eeceb48fc95656b269988c827006 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace operators { + +class CReduceMinOpMaker : public CReduceOpMaker { + protected: + std::string GetName() const override { return "Min"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_reduce_min, ops::CReduceOp, + ops::CReduceMinOpMaker); + +REGISTER_OP_CPU_KERNEL(c_reduce_min, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..77a75ed0b7af2a7946c02bfa0f33038aa0090c5b --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_reduce_min, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..81dc5c35bf14e569fe90743c1dc62a61fd5655ba --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd }; + +class CReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CReduceOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_GLOO) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto root_id = ctx.Attr("root_id"); + + auto place = ctx.GetPlace(); + int64_t send_numel = in->numel(); + const T* send_buff = in->data(); + T* recv_buff = out->mutable_data(in->dims(), place); + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + PADDLE_ENFORCE_EQ( + gloo->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "You must initialize the gloo environment first to use it.")); + gloo::ReduceOptions opts(gloo->GetContext()); + opts.setInput(const_cast(send_buff), send_numel); + opts.setOutput(recv_buff, send_numel); + opts.setRoot(root_id); + switch (red_type) { + case kRedSum: + opts.setReduceFunction( + static_cast( + &gloo::sum)); + break; + case kRedMax: + opts.setReduceFunction( + static_cast( + &gloo::max)); + break; + case kRedMin: + opts.setReduceFunction( + static_cast( + &gloo::min)); + break; + case kRedProd: + opts.setReduceFunction( + static_cast( + &gloo::product)); + break; + default: + PADDLE_ENFORCE_EQ(true, false, + 
platform::errors::InvalidArgument( + "Invalid reduce type: %d.", red_type)); + } + gloo::reduce(opts); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); +#endif + } +}; + +template +class CReduceOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + int root = ctx.Attr("root_id"); + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + ncclRedOp_t nccl_red_type = ncclSum; + switch (red_type) { + case kRedSum: + nccl_red_type = ncclSum; + break; + + case kRedMax: + nccl_red_type = ncclMax; + break; + + case kRedMin: + nccl_red_type = ncclMin; + break; + + case kRedProd: + nccl_red_type = ncclProd; + break; + + default: + PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument( + "red_type must be one of kRedSum, " + "kRedMax, kRedMin, kRedProd.")); + } + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(), + stream)); +#else + PADDLE_ENFORCE_EQ(true, false, + platform::errors::Unavailable( + "PaddlePaddle should compile with GPU..")); +#endif + } +}; + +class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor), tensor to be reduced."); + AddOutput("Out", "(Tensor) the reduced result."); + AddAttr("ring_id", "(int default 0) communication ring id.") + .SetDefault(0); + AddAttr("root_id", "(int default 0) root id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +CReduce %s Operator + +Call collective Reduce with reduce type %s. If input and output are +the same variable, in-place reduce will be used. +)DOC", + GetName(), GetName())); + } + + protected: + virtual std::string GetName() const = 0; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..64935df856ec79f427bdcd21e03b7c493c31ac1e --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
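c_reduce_op.h above implements one kernel template parameterized by the ReduceType enum and registers thin sum/max/min/prod wrappers that only differ in which reduction they select. A standalone sketch of that compile-time dispatch pattern (the Reduce function below is illustrative, not the Paddle kernel):

#include <algorithm>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd };

// One implementation, specialized at compile time by the reduce type, mirroring
// how the CPU/CUDA reduce kernels pick a gloo reduce function or ncclRedOp_t.
template <ReduceType red_type, typename T>
T Reduce(const std::vector<T>& v) {
  switch (red_type) {
    case kRedSum:
      return std::accumulate(v.begin(), v.end(), T(0));
    case kRedProd:
      return std::accumulate(v.begin(), v.end(), T(1), std::multiplies<T>());
    case kRedMax:
      return *std::max_element(v.begin(), v.end());
    case kRedMin:
      return *std::min_element(v.begin(), v.end());
  }
  return T(0);
}

int main() {
  std::vector<int> v = {3, 1, 4};
  std::cout << Reduce<kRedSum>(v) << " " << Reduce<kRedMax>(v) << "\n";  // 8 4
}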
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace operators { + +class CReduceProdOpMaker : public CReduceOpMaker { + protected: + std::string GetName() const override { return "Prod"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_reduce_prod, ops::CReduceOp, + ops::CReduceProdOpMaker); + +REGISTER_OP_CPU_KERNEL(c_reduce_prod, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..07e431f7bc838caa9bc3abdcd0be1beb94b96635 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_reduce_prod, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e20cee7e186a462aedc1881c6e34cacc8d09de0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace operators { + +class CReduceSumOpMaker : public CReduceOpMaker { + protected: + std::string GetName() const override { return "Sum"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_reduce_sum, ops::CReduceOp, + ops::CReduceSumOpMaker); + +REGISTER_OP_CPU_KERNEL(c_reduce_sum, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel, + ops::CReduceOpCPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9826422c16cb67f9f7101643918a83898c606b3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_reduce_sum, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel, + ops::CReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_scatter_op.cc b/paddle/fluid/operators/collective/c_scatter_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..908708e6e328f54466d4bb69b30fd607e14d1fe9 --- /dev/null +++ b/paddle/fluid/operators/collective/c_scatter_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_scatter_op.h" + +namespace paddle { +namespace operators { + +class CScatterOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CScatter"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CScatter"); + int root_id = ctx->Attrs().Get("root"); + int ring_id = ctx->Attrs().Get("ring_id"); + int nranks = ctx->Attrs().Get("nranks"); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The number of ranks (%d) must be greater than 1 " + "to use collective op (c_scatter op).", + nranks)); + PADDLE_ENFORCE_GE( + root_id, 0, + platform::errors::InvalidArgument( + "The root_id (%d) for c_scatter_op must be non-negative.", + root_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_scatter_op must be non-negative.", + root_id)); + framework::DDim dim = ctx->GetInputDim("X"); + dim[0] = dim[0] / nranks; + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be broadcasted."); + AddOutput("Out", "(Tensor) the result of broadcast."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("root", "(int default 0) root id for broadcasting.") + .SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +CScatter Operator +Scatter the source to all participators. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_scatter, ops::CScatterOp, ops::CScatterOpMaker); + +REGISTER_OP_CPU_KERNEL(c_scatter, ops::CScatterOpCPUKernel, + ops::CScatterOpCPUKernel, + ops::CScatterOpCPUKernel, + ops::CScatterOpCPUKernel, + ops::CScatterOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..8d9e6b4b7d99044f584e9e21062a786252d60f76 --- /dev/null +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_scatter_op.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + int numel = x->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int root_id = ctx.Attr("root"); + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + PADDLE_ENFORCE_EQ(nranks, comm->nranks(), + platform::errors::InvalidArgument( + "The number of ranks (%d) you set of must " + "be equal to comm->nranks (%d).", + nranks, comm->nranks())); + PADDLE_ENFORCE_GE( + root_id, 0, + platform::errors::InvalidArgument( + "The root_id (%d) for c_scatter_op must be non-negative.", + root_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_scatter_op must be non-negative.", + ring_id)); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + framework::DDim x_dims = x->dims(); + framework::DDim out_dims(x_dims); + framework::Tensor temp; + auto out_ptr = temp.mutable_data(out_dims, place); + if (root_id == comm->rank()) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + reinterpret_cast(const_cast(x->data())), numel, dtype, + root_id, comm->comm(), stream)); + + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(&temp)); + } else { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + out_ptr, numel, dtype, root_id, comm->comm(), stream)); + } + + out_dims[0] = out_dims[0] / nranks; + auto start_index = out_dims[0] * comm->rank(); + auto end_index = start_index + out_dims[0]; + temp = temp.Slice(start_index, end_index); + temp.Resize(out_dims); + out->mutable_data(out_dims, place); + framework::TensorCopySync(*static_cast(&temp), + place, static_cast(out)); + out->Resize(out_dims); +#else + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_scatter, ops::CScatterOpCUDAKernel, + ops::CScatterOpCUDAKernel, + ops::CScatterOpCUDAKernel, + ops::CScatterOpCUDAKernel, + ops::CScatterOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h new file mode 100644 index 0000000000000000000000000000000000000000..71a5f488ebc11a93cece9b85f6af288a4662b2d8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_scatter_op.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
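In the CUDA scatter kernel above, the root broadcasts the full tensor and each rank then keeps only its contiguous slice: the first dimension is divided by nranks, and the slice starts at rank * (dim0 / nranks) rows. A host-only sketch of that slicing arithmetic (scatter_slice is a hypothetical helper; no NCCL involved):

#include <cassert>
#include <iostream>
#include <vector>

// Each rank keeps rows [rank * chunk, (rank + 1) * chunk) of the broadcast buffer.
std::vector<float> scatter_slice(const std::vector<float>& full, int dim0, int row_numel,
                                 int nranks, int rank) {
  assert(dim0 % nranks == 0);
  int chunk = dim0 / nranks;
  auto begin = full.begin() + static_cast<long>(rank) * chunk * row_numel;
  return std::vector<float>(begin, begin + static_cast<long>(chunk) * row_numel);
}

int main() {
  std::vector<float> full = {0, 1, 2, 3, 4, 5, 6, 7};  // dim0 = 4, row_numel = 2
  auto mine = scatter_slice(full, /*dim0=*/4, /*row_numel=*/2, /*nranks=*/2, /*rank=*/1);
  for (float v : mine) std::cout << v << " ";  // 4 5 6 7
}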
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CScatterOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_GLOO) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto root_id = ctx.Attr("root"); + + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + PADDLE_ENFORCE_EQ( + gloo->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "You must initialize the gloo environment first to use it.")); + + int64_t send_numel = out->numel(); + auto nranks = gloo->Size(); + auto rank = gloo->Rank(); + T* recv_buff = out->data(); + gloo::ScatterOptions opts(gloo->GetContext()); + if (root_id == rank) { + T* send_buff = const_cast(in->data()); + std::vector ptrs(nranks); + for (int i = 0; i < nranks; ++i) { + ptrs[i] = send_buff; + send_buff += send_numel; + } + opts.setInputs(ptrs, send_numel); + } + opts.setOutput(recv_buff, send_numel); + opts.setRoot(root_id); + + gloo::scatter(opts); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 74589dcb6a74c79299ef682de0bce146f33ec261..fb8cde70f5324f42fbc05fdfd65b548e0e58206a 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/logical_op.h" +#include #include +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -97,19 +99,19 @@ class BinaryLogicalOp : public LogicalOp { OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type); auto dim_x = context->GetInputDim("X"); auto dim_y = context->GetInputDim("Y"); - - int product_x = framework::product(dim_x); - int product_y = framework::product(dim_y); - bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0); - if (check) { - PADDLE_ENFORCE_EQ(product_x, product_y, - platform::errors::InvalidArgument( - "The number of elements in X and Y should be same, " - "but received %d != %d", - product_x, product_y)); + if (dim_x == dim_y) { + context->SetOutputDim("Out", dim_x); + } else { + int max_dim = std::max(dim_x.size(), dim_y.size()); + int axis = std::abs(dim_x.size() - dim_y.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), + max_dim, axis); + context->SetOutputDim("Out", framework::make_ddim(out_dims_array)); } - - context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); } }; diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h index 4a83e0fda6e4ecdb1112f096eb37159337c37147..2c39201a426a25bb8595f415d80192080f1f1931 100644 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ b/paddle/fluid/operators/controlflow/logical_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -57,10 +58,8 @@ class BinaryLogicalOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), y->data(), - out->mutable_data(context.GetPlace()), binary_func); + ElementwiseComputeEx(context, x, y, -1, + binary_func, out); } }; diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index b44aa4ce4f893720ef55a7daf1d7b1e757c7480c..25e887ba6675e6c28bcd44c3b57c2ea571c075e3 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -37,6 +37,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { auto filter_dims = ctx->GetInputDim("Filter"); std::vector output_size = ctx->Attrs().Get>("output_size"); + std::vector output_padding = + ctx->Attrs().Get>("output_padding"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); std::vector dilations = ctx->Attrs().Get>("dilations"); @@ -78,6 +80,12 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { platform::errors::InvalidArgument( "The Attr(output_size) and Attr(stride) of Op(conv_transpose) " "should be the same.")); + if (output_padding.size()) + PADDLE_ENFORCE_EQ( + output_padding.size(), strides.size(), + platform::errors::InvalidArgument( + "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) " + "should be the same.")); const int64_t C = (data_layout != DataLayout::kNHWC ? 
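The logical-op InferShape change above replaces the equal-numel check with broadcasting via GetBroadcastDimsArrays. The sketch below shows the right-aligned broadcast rule this relies on (a size-1 dimension stretches, otherwise sizes must match); BroadcastDims is an illustrative stand-in, not the Paddle helper:

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <vector>

// Right-aligned broadcasting: pad the shorter shape with 1s on the left, then
// each output dim is max(a, b), provided a == b, a == 1, or b == 1.
std::vector<int> BroadcastDims(std::vector<int> x, std::vector<int> y) {
  size_t n = std::max(x.size(), y.size());
  x.insert(x.begin(), n - x.size(), 1);
  y.insert(y.begin(), n - y.size(), 1);
  std::vector<int> out(n);
  for (size_t i = 0; i < n; ++i) {
    if (x[i] != y[i] && x[i] != 1 && y[i] != 1)
      throw std::invalid_argument("shapes are not broadcastable");
    out[i] = std::max(x[i], y[i]);
  }
  return out;
}

int main() {
  for (int d : BroadcastDims({4, 1, 3}, {2, 3})) std::cout << d << " ";  // 4 2 3
}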
in_dims[1] @@ -136,6 +144,27 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { infer_shape + strides[i])); } output_shape.push_back(output_size[i]); + } else if (output_padding.size()) { + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE( + output_padding[i], 0, + platform::errors::InvalidArgument( + "output_padding of Op(ConvTransposeOp) should not be " + "less than the 0. But received output_padding = " + "[%s], whose dim %d is less than 0", + framework::make_ddim(output_padding), i)); + PADDLE_ENFORCE_LT( + output_padding[i], std::max(strides[i], dilations[i]), + platform::errors::InvalidArgument( + "output_padding of Op(ConvTransposeOp) should be less " + "than either stride or dilation. But received output_size = " + "[%s], " + "whose dim %d is not less than either stride (%d) or " + "dilation (%d)", + framework::make_ddim(output_size), i, strides[i], + dilations[i])); + } + output_shape.push_back((infer_shape + output_padding[i])); } else { output_shape.push_back(infer_shape); } @@ -223,10 +252,14 @@ void Conv2DTransposeOpMaker::Make() { "The format of output tensor is X (one-dimensional) of size equal" "to the number of output channels. Only used with MKL-DNN.") .AsDispensable(); - AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is the same as input tensor."); + AddAttr>("output_padding", + "(vector default: []), Additional size added " + "to one side of each dimension in the output " + "shape") + .SetDefault({}); AddAttr>("output_size", "(vector default: []), the " "size of the output tensor") @@ -338,6 +371,11 @@ void Conv3DTransposeOpMaker::Make() { "Where N is batch size, C is " "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); + AddAttr>("output_padding", + "(vector default: []), Additional size added " + "to one side of each dimension in the output " + "shape") + .SetDefault({}); AddAttr>("output_size", "(vector default: []), the " "size of the output tensor") diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc old mode 100755 new mode 100644 index 2e9db16be5530f65e0cf5d4d99ee2ad936105983..89ec1ddd12b9d8da8dba604ae4e759054212608e --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/cum_op.h" namespace paddle { @@ -95,3 +96,14 @@ REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, ops::CumKernel>, ops::CumKernel>, ops::CumKernel>); + +REGISTER_OP_VERSION(cumsum) + .AddCheckpoint( + R"ROC( + Upgrade cumsum add a new attribute [flatten]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "flatten", + "In order to compute the cumsum over the flattened array when the " + "argument `axis` in python API is None.", + false)); diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h index c6140483ff5cb8108895546b6a01f058708231fd..956fd5ad3035434fbf3093786319b3f7ab7e7354 100644 --- a/paddle/fluid/operators/cvm_op.h +++ b/paddle/fluid/operators/cvm_op.h @@ -68,8 +68,19 @@ class CVMOpKernel : public framework::OpKernel { // for Input X do not have Lod Information. 
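With the new output_padding attribute above, the inferred transposed-convolution length per spatial dimension becomes infer_shape + output_padding, where output_padding must be non-negative and smaller than max(stride, dilation). A one-dimensional sketch, assuming the standard (in - 1) * stride - pad_sum + dilation * (kernel - 1) + 1 formula for infer_shape (that part is computed earlier in the file, outside the hunk shown):

#include <algorithm>
#include <cassert>
#include <iostream>

// Inferred output length of a 1-D transposed convolution with output_padding.
int ConvTransposeOutSize(int in, int kernel, int stride, int pad_sum, int dilation,
                         int output_padding) {
  assert(output_padding >= 0 && output_padding < std::max(stride, dilation));
  int filter_extent = dilation * (kernel - 1) + 1;
  int infer_shape = (in - 1) * stride - pad_sum + filter_extent;
  return infer_shape + output_padding;
}

int main() {
  // stride 2, no padding, no dilation: 5 -> 11 without output_padding, 12 with it.
  std::cout << ConvTransposeOutSize(5, 3, 2, 0, 1, 0) << " "
            << ConvTransposeOutSize(5, 3, 2, 0, 1, 1) << "\n";
}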
if (x->NumLevels() == 0) { - for (int i = 0; i < batch_size; i++) { - CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); + if (use_cvm) { + for (int i = 0; i < batch_size; i++) { + int cursor = i * item_size; + y_data[cursor] = log(x_data[cursor] + 1); + y_data[cursor + 1] = log(x_data[cursor + 1] + 1) - y_data[cursor]; + for (int j = 2; j < item_size; j++) { + y_data[cursor + j] = x_data[cursor + j]; + } + } + } else { + for (int i = 0; i < batch_size; i++) { + CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); + } } } else { auto lod = x->lod()[0]; diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index 16e1699e12c832d54af14f673577dcc32b015d6d..5cd853758926e622d0f87e6f8bbaba2cf3b9f85e 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -222,10 +222,12 @@ class BipartiteMatchKernel : public framework::OpKernel { } else { auto lod = dist_mat->lod().back(); for (size_t i = 0; i < lod.size() - 1; ++i) { - Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); - BipartiteMatch(one_ins, indices + i * col, dist + i * col); - if (type == "per_prediction") { - ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold); + if (lod[i + 1] > lod[i]) { + Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); + BipartiteMatch(one_ins, indices + i * col, dist + i * col); + if (type == "per_prediction") { + ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold); + } } } } diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..67dc2843345682b2dfe3d568e452461860575544 --- /dev/null +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
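The new use_cvm branch in cvm_op.h above log-transforms the first two columns of each row (y0 = log(x0 + 1), y1 = log(x1 + 1) - y0) and copies the remaining features unchanged. An equivalent standalone sketch for a single row (CvmRow is an illustrative name):

#include <cmath>
#include <iostream>
#include <vector>

// Apply the CVM transform to one row of item_size features.
std::vector<float> CvmRow(const std::vector<float>& x) {
  std::vector<float> y(x.size());
  y[0] = std::log(x[0] + 1);
  y[1] = std::log(x[1] + 1) - y[0];
  for (size_t j = 2; j < x.size(); ++j) y[j] = x[j];
  return y;
}

int main() {
  for (float v : CvmRow({10.f, 3.f, 0.5f, 0.25f})) std::cout << v << " ";
}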
*/ + +#include "paddle/fluid/operators/diag_v2_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +class DiagV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "diag_v2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag_v2"); + + auto x_dims = ctx->GetInputDim("X"); + auto offset = ctx->Attrs().Get("offset"); + + if (x_dims.size() == 1UL) { + int64_t size = x_dims[0] + std::abs(offset); + ctx->SetOutputDim("Out", {size, size}); + } else if (x_dims.size() == 2UL) { + int64_t size; + if (offset >= 0) { + size = std::min(x_dims[0], x_dims[1] - offset); + } else { + size = std::min(x_dims[0] + offset, x_dims[1]); + } + ctx->SetOutputDim("Out", {size}); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor X's dimensions of DiagV2Op should be either 1 or " + "2, but received %d.", + x_dims.size())); + } + } +}; + +class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor. Its shape is either 1-D or 2-D."); + AddOutput("Out", "The output tensor. A square matrix or a vector."); + AddAttr("offset", + "The diagonal offset. A positive value represents " + "superdiagonal, 0 represents the main diagonal, and a " + "negative value represents subdiagonal.") + .SetDefault(0); + AddAttr("padding_value", + "Use this value to fill the area outside the specified " + "diagonal band. Only takes effect when the input is a 1-D " + "Tensor. The default value is 0.") + .SetDefault(0.0f); + AddComment(R"DOC( + If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. + + If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned. + + The argument ``offset`` controls the diagonal offset: + + If ``offset`` = 0, it is the main diagonal. + + If ``offset`` > 0, it is superdiagonal. + + If ``offset`` < 0, it is subdiagonal. +)DOC"); + } +}; + +template +class DiagV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* x_data = X->data(); + auto x_dims = X->dims(); + int offset = context.Attr("offset"); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + auto out_dims = out->dims(); + + int64_t i; + if (x_dims.size() == 1) { + float padding_value = context.Attr("padding_value"); + math::SetConstant set_padding_value; + auto& dev_ctx = context.template device_context(); + set_padding_value(dev_ctx, out, static_cast(padding_value)); + + auto x_length = x_dims[0]; + const int& x_stride = ComputeStride(0, x_dims); + + auto out_stride_0 = ComputeStride(0, out_dims); + auto out_stride_1 = ComputeStride(1, out_dims); + out_data += + (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); + + for (i = 0; i < x_length; i++) { + out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride]; + } + } else { + auto out_length = out_dims[0]; + const int& x_stride_0 = ComputeStride(0, x_dims); + const int& x_stride_1 = ComputeStride(1, x_dims); + + auto out_stride_0 = ComputeStride(0, out_dims); + x_data += (offset >= 0 ? 
offset * x_stride_1 : -offset * x_stride_0); + for (i = 0; i < out_length; i++) { + out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)]; + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + diag_v2, ops::DiagV2Kernel, + ops::DiagV2Kernel, + ops::DiagV2Kernel, + ops::DiagV2Kernel); diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4386cc6b8183c03b4d4a19aba7d1126eac2ab495 --- /dev/null +++ b/paddle/fluid/operators/diag_v2_op.cu @@ -0,0 +1,122 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/diag_v2_op.h" + +namespace paddle { +namespace operators { + +// Extract the diagonal of a matrix 'x' to a vector 'out'. +template +__global__ void ExtractDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, + std::ptrdiff_t size, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t outStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t xOffset = start + sumStride * idx; + out[outStride * idx] = x[xOffset]; + } +} + +// Paste a vector 'x' to the diagonal of a matrix 'out' +template +__global__ void PasteDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, + std::ptrdiff_t x_length, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t xStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < x_length; idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t outOffset = start + sumStride * idx; + out[outOffset] = x[xStride * idx]; + } +} + +template +class DiagV2CUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* x_data = X->data(); + auto x_dims = X->dims(); + int offset = context.Attr("offset"); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + auto out_dims = out->dims(); + auto& dev_ctx = context.template device_context(); + + if (x_dims.size() == 1) { + float padding_value = context.Attr("padding_value"); + math::SetConstant set_padding_value; + set_padding_value(dev_ctx, out, static_cast(padding_value)); + + auto x_length = x_dims[0]; + auto size = (offset > 0) ? 
x_length + offset : x_length - offset; + const int& x_stride = ComputeStride(0, x_dims); + if (size > 0) { + const int block_num = std::min(static_cast(size), + dev_ctx.GetMaxPhysicalThreadCount()); + int size_ = static_cast(size); + int block_num_ = static_cast(block_num); + const int grid_num = + std::min(1024, (size_ + block_num_ - 1) / block_num_); + const auto& out_stride_0 = ComputeStride(0, out_dims); + const auto& out_stride_1 = ComputeStride(1, out_dims); + auto start = + (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); + + PasteDiagonalKernel<<>>( + out_data, x_data, start, x_length, out_stride_0 + out_stride_1, + x_stride); + } + } else { + const int& x_stride_0 = ComputeStride(0, x_dims); + const int& x_stride_1 = ComputeStride(1, x_dims); + + int size; + if (offset > 0) { + size = std::min(x_dims[0], x_dims[1] - offset); + } else { + size = std::min(x_dims[0] + offset, x_dims[1]); + } + + if (size > 0) { + const int block_num = std::min(static_cast(size), + dev_ctx.GetMaxPhysicalThreadCount()); + int size_ = static_cast(size); + int block_num_ = static_cast(block_num); + const int grid_num = + std::min(1024, (size_ + block_num_ - 1) / block_num_); + auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); + const auto& out_stride_0 = ComputeStride(0, out_dims); + + ExtractDiagonalKernel<<>>( + out_data, x_data, start, size, x_stride_0 + x_stride_1, + out_stride_0); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + diag_v2, ops::DiagV2CUDAKernel, + ops::DiagV2CUDAKernel, + ops::DiagV2CUDAKernel, + ops::DiagV2CUDAKernel); diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/fluid/operators/diag_v2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7850def06117ff4232afe4fca95a3e3e500e876d --- /dev/null +++ b/paddle/fluid/operators/diag_v2_op.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
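Reviewer note: the CPU and CUDA diag_v2 kernels above address the diagonal with the same stride arithmetic; the stride helper itself lives in diag_v2_op.h just below. A minimal host-side sketch of that arithmetic, with illustrative names that are not part of this patch:

#include <algorithm>
#include <cstdint>
#include <vector>

// Row-major stride of `axis`: product of all dimensions after it.
static int64_t ComputeStrideSketch(int axis, const std::vector<int64_t>& dims) {
  int64_t size = 1;
  for (size_t i = axis + 1; i < dims.size(); ++i) size *= dims[i];
  return size;
}

// Extract the `offset`-th diagonal of an h x w row-major matrix into `out`.
// A positive offset starts at column `offset`, a negative one at row `-offset`,
// and every diagonal step advances by (row stride + column stride).
static void ExtractDiagonalSketch(const float* x, int64_t h, int64_t w,
                                  int64_t offset, std::vector<float>* out) {
  const std::vector<int64_t> dims = {h, w};
  const int64_t stride0 = ComputeStrideSketch(0, dims);  // equals w
  const int64_t stride1 = ComputeStrideSketch(1, dims);  // equals 1
  const int64_t len =
      offset >= 0 ? std::min(h, w - offset) : std::min(h + offset, w);
  x += offset >= 0 ? offset * stride1 : -offset * stride0;
  for (int64_t i = 0; i < len; ++i) {
    out->push_back(x[i * (stride0 + stride1)]);
  }
}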
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using DDim = framework::DDim; + +static inline int ComputeStride(int axis, DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index cff3993a068ceee1947ca3e17b9cc6a75e3c9ba9..a033611f478f9ea44fd49ab2015e78aaea6aacd9 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -61,7 +61,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h index fb7a0691154de768d4b828ee5d7b6a47755225f4..9e39e68cba779de4dc598046e45f7d35e292bb79 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ b/paddle/fluid/operators/distributed/large_scale_kv.h @@ -14,20 +14,20 @@ #pragma once +#include #include #include #include // NOLINT #include #include +#include // NOLINT #include #include #include #include -#include // NOLINT - -#include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/selected_rows.h" @@ -88,21 +88,17 @@ class UniformInitializer : public Initializer { min_ = std::stof(attrs[2]); max_ = std::stof(attrs[3]); - if (seed_ == 0) { - seed_ = std::random_device()(); - } - - random_engine_.seed(seed_); dist_ = std::uniform_real_distribution(min_, max_); + random_engine_ = framework::GetCPURandomEngine(seed_); } - float GetValue() override { return dist_(random_engine_); } + float GetValue() override { return dist_(*random_engine_); } private: float min_; float max_; - std::minstd_rand random_engine_; + std::shared_ptr random_engine_; std::uniform_real_distribution dist_; }; @@ -133,21 +129,18 @@ class GaussianInitializer : public Initializer { mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - if (seed_ == 0) { - seed_ = std::random_device()(); - } + random_engine_ = framework::GetCPURandomEngine(seed_); - random_engine_.seed(seed_); dist_ = std::normal_distribution(mean_, std_); } - float GetValue() override { return dist_(random_engine_); } + float GetValue() override { return dist_(*random_engine_); } private: float std_; float mean_; - std::minstd_rand random_engine_; + std::shared_ptr random_engine_; std::normal_distribution dist_; }; diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc 
b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 5a67b358ddabb12566cd4ffe00cb12c65a185099..a9378d61c3ca39bd43b558633cc4d04c40175cac 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -110,7 +110,7 @@ void prefetch_core( int pservers = context.Attr("pserver_num"); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(context.GetPlace()); + auto &actual_ctx = *pool.Get(platform::CPUPlace()); std::unique_ptr local_scope = scope.NewTmpScope(); @@ -144,7 +144,6 @@ void prefetch_core( VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; } } - for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( "internal error in RPCClient")); @@ -167,6 +166,7 @@ void prefetch_core( for (int64_t i = 0; i < dims[0]; ++i) { auto origin_id = ids_in_this_section[i]; std::vector vecs(row_numel); + std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); (*recved_vec_map)[origin_id] = vecs; } @@ -213,18 +213,18 @@ void prefetchs(const std::vector &id_var_names, const auto place = scope.FindVar(id_var_names[0])->Get().place(); - if (!platform::is_cpu_place(place)) { - PADDLE_THROW("multi prefetch only support CPU currently"); - } - + std::vector> ids_group; std::vector ids_union; + std::vector ids_lods; TableAndEndpoints tables; for (auto &id_name : id_var_names) { - auto *in_var = scope.FindVar(id_name); - auto &id_tensor = in_var->Get(); - std::copy_n(id_tensor.data(), id_tensor.numel(), - back_inserter(ids_union)); + auto &id_tensor = scope.FindVar(id_name)->Get(); + std::vector ids; + TensorToVector(id_tensor, context.device_context(), &ids); + ids_union.insert(ids_union.end(), ids.begin(), ids.end()); + ids_group.push_back(ids); + ids_lods.push_back(id_tensor.lod()); } std::unordered_set s(ids_union.begin(), ids_union.end()); @@ -258,25 +258,48 @@ void prefetchs(const std::vector &id_var_names, } for (size_t i = 0; i < out_var_names.size(); i++) { - auto *in_var = scope.FindVar(id_var_names[i]); - auto &id_tensor = in_var->Get(); - auto ids_size = id_tensor.dims()[0]; - const auto *id_data = id_tensor.data(); - + std::vector ids = ids_group[i]; + auto ids_size = ids.size(); auto *out_t = scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(id_tensor.lod()); - out_t->Resize(framework::make_ddim({ids_size, vec_dim_1})); + out_t->set_lod(ids_lods[i]); + out_t->Resize( + framework::make_ddim({static_cast(ids_size), vec_dim_1})); auto *out_d = out_t->mutable_data(place); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = id_data[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); + if (platform::is_cpu_place(out_t->place())) { + for (auto idx = 0; idx < static_cast(ids_size); idx++) { + const auto &id = ids[idx]; + if (padding_idx != distributed::kNoPadding && id == padding_idx) { + memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); + } else { + std::copy_n(recved_vec_map[id].begin(), vec_dim_1, + out_d + idx * vec_dim_1); + } + } + } else { +#ifdef PADDLE_WITH_CUDA + for (auto idx = 0; idx < static_cast(ids_size); idx++) { + const auto &id = ids[idx]; + auto stream = context.cuda_device_context().stream(); + if (padding_idx != 
distributed::kNoPadding && id == padding_idx) { + platform::GpuMemsetAsync(out_d + idx * vec_dim_1, 0, + sizeof(float) * vec_dim_1, stream); + } else { + auto &cpu_place = + BOOST_GET_CONST(platform::CPUPlace, + paddle::platform::CPUDeviceContext().GetPlace()); + auto &gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); + memory::Copy(gpu_place, out_d + idx * vec_dim_1, cpu_place, + &recved_vec_map[id][0], sizeof(float) * vec_dim_1, + stream); + } } +#else + PADDLE_ENFORCE(true, platform::errors::PermissionDenied( + "Paddle is not compiled with GPU!")); +#endif } } } diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc index 3037a63b0d7b4e8812e67fdfb776f89ea43eb546..8c093d12585981ee681ae13f0d2e493197c6b9b3 100644 --- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#include "paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { @@ -75,47 +73,6 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel { } }; -template -class DistributedLookupTableKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto ids_vars = context.MultiInputVar("Ids"); - auto emb_vars = context.MultiOutput("Embeddings"); - - auto id_names = context.InputNames("Ids"); - auto embedding_name = context.InputNames("W").front(); - auto out_names = context.OutputNames("Outputs"); - auto lookup_tables = context.Attr>("table_names"); - auto endpoints = context.Attr>("endpoints"); - auto is_distributed = context.Attr("is_distributed"); - - auto lookup_table_version = - context.Attr("lookup_table_version"); - - operators::distributed::prefetchs(id_names, out_names, embedding_name, - is_distributed, lookup_tables, endpoints, - context, context.scope()); - - if (lookup_table_version == "lookup_table_v2") { - auto &scope = context.scope(); - auto emb_dim = - scope.FindVar(embedding_name)->Get().dims()[1]; - - for (size_t i = 0; i < id_names.size(); ++i) { - auto *id_var = scope.FindVar(id_names[i]); - auto *out_var = scope.FindVar(out_names[i]); - auto *id_tensor = id_var->GetMutable(); - auto *out_tensor = out_var->GetMutable(); - - auto id_dims = id_tensor->dims(); - out_tensor->Resize(framework::make_ddim( - {static_cast(id_dims[0]), static_cast(id_dims[1]), - static_cast(emb_dim)})); - } - } - } -}; - class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -170,15 +127,12 @@ class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( 
Lookup Tablel Prefetch Operator. - This operator is used to perform lookup on parameter W, then concatenated into a sparse tensor. - The type of Ids(Input) is SelectedRows, the rows of Ids contains the ids to be looked up in W; if the Id is not in the sparse table, this operator will return a random value and set the value into the table for the next looking up. - )DOC"); } }; @@ -191,4 +145,5 @@ REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp, ops::DistributedLookupTableOpMaker); REGISTER_OP_CPU_KERNEL(distributed_lookup_table, - ops::DistributedLookupTableKernel); + ops::DistributedLookupTableKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..54c894415096e869f363eda6a1de2a473e839263 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + distributed_lookup_table, + ops::DistributedLookupTableKernel); diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a71451c78a870b71c05b41bdcfb34a85b3e2213b --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
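Reviewer note: the DistributedLookupTableKernel body is removed from the .cc file and re-homed in the header that follows, templated on the device context, so the CPU registration above and the new CUDA registration can share one Compute implementation. A stripped-down sketch of that pattern (my_op and MyKernel are placeholders, not names from this patch):

// my_op.h: kernel defined once, templated on the device context.
template <typename DeviceContext, typename T>
class MyKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // device-independent body (here: the distributed prefetch call)
  }
};

// my_op.cc
REGISTER_OP_CPU_KERNEL(
    my_op, ops::MyKernel<paddle::platform::CPUDeviceContext, float>);

// my_op.cu.cc
REGISTER_OP_CUDA_KERNEL(
    my_op, ops::MyKernel<paddle::platform::CUDADeviceContext, float>);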
*/ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class DistributedLookupTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto ids_vars = context.MultiInputVar("Ids"); + auto emb_vars = context.MultiOutput("Embeddings"); + + auto id_names = context.InputNames("Ids"); + auto embedding_name = context.InputNames("W").front(); + auto out_names = context.OutputNames("Outputs"); + auto lookup_tables = context.Attr>("table_names"); + auto endpoints = context.Attr>("endpoints"); + auto is_distributed = context.Attr("is_distributed"); + + operators::distributed::prefetchs(id_names, out_names, embedding_name, + is_distributed, lookup_tables, endpoints, + context, context.scope()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/recv_save_op.cc b/paddle/fluid/operators/distributed_ops/recv_save_op.cc index ccc30d1ea082a6f69b71059631247144c931116e..d194fcda36a474fa208f5d5a67e425ba5a5a3303 100644 --- a/paddle/fluid/operators/distributed_ops/recv_save_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_save_op.cc @@ -44,7 +44,7 @@ class RecvSaveOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::proto::VarType::Type(ctx.Attr("dtype")), - ctx.GetPlace()); + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 2580b00d7c2bdfae726b924bb51de199586e12c3..cec706300d77b2b0e66e5b682dfad8536b5dc401 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -26,6 +26,86 @@ template using EigenMatrix = framework::EigenMatrix; +template +void DotGradFunction(const Tensor* tensor_x, const Tensor* tensor_y, + const Tensor* tensor_dout, Tensor* tensor_dx, + Tensor* tensor_dy, + const paddle::framework::ExecutionContext& ctx) { +#ifdef __NVCC__ + if (1 == tensor_dout->dims().size()) { + auto dout = framework::EigenVector::Flatten(*tensor_dout); + + if (tensor_dx) { + auto y = framework::EigenVector::Flatten(*tensor_y); + auto dx = framework::EigenVector::Flatten(*tensor_dx); + auto& dev = *ctx.template device_context().eigen_device(); + Eigen::DSizes size(tensor_dx->numel()); + dx.device(dev) = y * dout.broadcast(size); + } + + if (tensor_dy) { + auto x = framework::EigenVector::Flatten(*tensor_x); + auto dy = framework::EigenVector::Flatten(*tensor_dy); + auto& dev = *ctx.template device_context().eigen_device(); + Eigen::DSizes size(tensor_dy->numel()); + dy.device(dev) = x * dout.broadcast(size); + } + } else { + auto dout = EigenMatrix::From(*tensor_dout); + + if (tensor_dx) { + tensor_dx->mutable_data(ctx.GetPlace()); + auto y = EigenMatrix::From(*tensor_y); + auto dx = EigenMatrix::From(*tensor_dx); + auto& dev = *ctx.template device_context().eigen_device(); + Eigen::DSizes size(1, tensor_dx->dims()[1]); + dx.device(dev) = y * dout.broadcast(size); + } + + if (tensor_dy) { + tensor_dy->mutable_data(ctx.GetPlace()); + auto x = EigenMatrix::From(*tensor_x); + auto dy = EigenMatrix::From(*tensor_dy); + auto& dev = *ctx.template device_context().eigen_device(); + Eigen::DSizes size(1, tensor_dy->dims()[1]); + 
dy.device(dev) = x * dout.broadcast(size); + } + } +#else + const auto* data_dout = tensor_dout->data(); + + if (tensor_dx) { + auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + const auto* data_y = tensor_y->data(); + const framework::DDim& dim = tensor_x->dims(); + size_t N = static_cast(framework::product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dx[i] = data_y[i] * data_dout[s]; + } + } + + if (tensor_dy) { + auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); + const auto* data_x = tensor_x->data(); + const framework::DDim& dim = tensor_y->dims(); + size_t N = static_cast(framework::product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dy[i] = data_x[i] * data_dout[s]; + } + } +#endif +} + template class DotKernel : public framework::OpKernel { public: @@ -84,83 +164,9 @@ class DotGradKernel : public framework::OpKernel { if (tensor_dx) tensor_dx->mutable_data(ctx.GetPlace()); if (tensor_dy) tensor_dy->mutable_data(ctx.GetPlace()); -#ifdef __NVCC__ - if (1 == tensor_dout->dims().size()) { - auto dout = framework::EigenVector::Flatten(*tensor_dout); - - if (tensor_dx) { - auto y = framework::EigenVector::Flatten(*tensor_y); - auto dx = framework::EigenVector::Flatten(*tensor_dx); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(tensor_dx->numel()); - dx.device(dev) = y * dout.broadcast(size); - } - - if (tensor_dy) { - auto x = framework::EigenVector::Flatten(*tensor_x); - auto dy = framework::EigenVector::Flatten(*tensor_dy); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(tensor_dy->numel()); - dy.device(dev) = x * dout.broadcast(size); - } - } else { - auto dout = EigenMatrix::From(*tensor_dout); - - if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); - auto y = EigenMatrix::From(*tensor_y); - auto dx = EigenMatrix::From(*tensor_dx); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(1, tensor_dx->dims()[1]); - dx.device(dev) = y * dout.broadcast(size); - } - - if (tensor_dy) { - tensor_dy->mutable_data(ctx.GetPlace()); - auto x = EigenMatrix::From(*tensor_x); - auto dy = EigenMatrix::From(*tensor_dy); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(1, tensor_dy->dims()[1]); - dy.device(dev) = x * dout.broadcast(size); - } - } -#else - const auto* data_dout = tensor_dout->data(); - - if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_y = tensor_y->data(); - const framework::DDim& dim = tensor_x->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = data_y[i] * data_dout[s]; - } - } - - if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_x = tensor_x->data(); - const framework::DDim& dim = tensor_y->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = data_x[i] * data_dout[s]; - } - } -#endif + DotGradFunction(tensor_x, tensor_y, tensor_dout, + tensor_dx, tensor_dy, ctx); } }; diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index 
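Reviewer note: the NVCC and CPU branches of the dot gradient are factored out of DotGradKernel into the free function DotGradFunction above; on the CPU path the gradient of each row-wise dot product is just the other input scaled by that row's dout. A minimal sketch of the CPU branch:

#include <cstddef>
#include <vector>

// For each row r of length row_len: dx[r][j] = y[r][j] * dout[r].
void DotGradCpuSketch(const std::vector<float>& y,
                      const std::vector<float>& dout, size_t row_len,
                      std::vector<float>* dx) {
  dx->resize(y.size());
  for (size_t i = 0; i < y.size(); ++i) {
    (*dx)[i] = y[i] * dout[i / row_len];  // i / row_len picks the row of dout
  }
}

// The dy path is symmetric: dy[r][j] = x[r][j] * dout[r].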
676361289e888a5bbd71b63ff16a1ddd7e5dad51..9d9eb4a82a075f27764a73d0e976dbf3f7181cb1 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -54,24 +55,22 @@ class CPUDropoutKernel : public framework::OpKernel { std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT return; } - + // std::minstd_rand engine; // NOTE: fixed seed should only be used in unittest or for debug. // Guarantee to use random seed in training. - std::random_device rnd; - std::minstd_rand engine; - int seed_data; + int seed_data = 0; if (seed) { seed_data = *(seed->data()); } else { seed_data = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + context.Attr("fix_seed") ? context.Attr("seed") : 0; } - engine.seed(seed_data); + auto engine = framework::GetCPURandomEngine(seed_data); std::uniform_real_distribution dist(0, 1); for (size_t i = 0; i < size; ++i) { - if (dist(engine) < dropout_prob) { + if (dist(*engine) < dropout_prob) { mask_data[i] = 0; y_data[i] = 0; } else { diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index 5a398fa50febe2efffd588ce8f3612f1f9cec0b6..457d9e79d7da171ef526d5cab0e59b021cb64f98 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -49,6 +49,8 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, REGISTER_OP_CPU_KERNEL( elementwise_floordiv, + ops::ElementwiseFloorDivKernel, + ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index 60846d1e8fee1c7f68ac101f18355750c2c15a4d..f63d6f037632c1a6a05726b933b2258adc113ee3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -19,5 +19,7 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_floordiv, + ops::ElementwiseFloorDivKernel, + ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index 2d24e394d5c823dbd22c837210e46cefeceba1be..8afe2133c0488bbe04ec4803aac5dce6573f634d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -14,6 +14,7 @@ limitations under the License. 
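Reviewer note: as with the large_scale_kv initializers earlier in this patch, the dropout CPU kernel now draws from framework::GetCPURandomEngine(seed) instead of seeding a local std::minstd_rand, so a zero seed falls through to the shared engine rather than being reseeded from std::random_device. A self-contained sketch of the masking loop; std::mt19937_64 stands in for the Paddle engine here:

#include <cstddef>
#include <cstdint>
#include <memory>
#include <random>

// Zero out elements with probability `dropout_prob`; `mask` records the choice.
void DropoutMaskSketch(const float* x, float* y, uint8_t* mask, size_t size,
                       float dropout_prob, uint64_t seed) {
  auto engine = std::make_shared<std::mt19937_64>(seed);  // stand-in for GetCPURandomEngine
  std::uniform_real_distribution<float> dist(0.f, 1.f);
  for (size_t i = 0; i < size; ++i) {
    if (dist(*engine) < dropout_prob) {
      mask[i] = 0;
      y[i] = 0.f;
    } else {
      mask[i] = 1;
      y[i] = x[i];  // the real kernel's scaling variants are omitted in this sketch
    }
  }
}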
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -24,7 +25,36 @@ namespace operators { template struct FloorDivFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a / b; } + inline HOSTDEVICE T operator()(T a, T b) const { +#ifdef __CUDA_ARCH__ + if (b == 0) { + printf("Error: Divide by zero encounter in floor_divide\n"); + asm("trap;"); + } +#else + if (b == 0) + PADDLE_THROW(platform::errors::InvalidArgument( + "Divide by zero encounter in floor_divide")); +#endif + return static_cast(std::trunc(a / b)); + } +}; + +template +struct InverseFloorDivFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { +#ifdef __CUDA_ARCH__ + if (a == 0) { + printf("Error: Divide by zero encounter in floor_divide\n"); + asm("trap;"); + } +#else + if (a == 0) + PADDLE_THROW(platform::errors::InvalidArgument( + "Divide by zero encounter in floor_divide")); +#endif + return static_cast(std::trunc(b / a)); + } }; template @@ -32,8 +62,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, FloorDivFunctor(), z); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, FloorDivFunctor(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseFloorDivFunctor(), z); + } } template diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index af80666b9542db1073f3b07618433671652fffa4..8c2e62bed195f27e228d5dd460ba21ed87c3f5d2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -25,14 +25,14 @@ class ElementwiseModOpMaker : public ElementwiseOpMaker { void AddInputX() override { AddInput("X", - "(Variable), Tensor or LoDTensor of any dimensions. Its dtype " - "should be int32, int64."); + "(Tensor), Tensor of any dimensions. Its dtype " + "should be int32, int64, float32 or float64."); } void AddInputY() override { AddInput("Y", - "(Variable), Tensor or LoDTensor of any dimensions. Its dtype " - "should be int32, int64."); + "(Tensor), Tensor of any dimensions. 
Its dtype " + "should be int32, int64, float32 or float64."); } std::string GetOpFuntionality() const override { diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h index 4306a471b76c5bd4f0a5284052d7d39aa5fbc279..47bd6af0b95ace2b9b753e38cfc5f191bc1bb942 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -24,13 +24,19 @@ namespace operators { template struct ModFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a % b; } + inline HOSTDEVICE T operator()(T a, T b) const { + T res = a % b; + if ((res != 0) && ((res < 0) != (b < 0))) res += b; + return res; + } }; template struct ModFunctorFP { inline HOSTDEVICE T operator()(T a, T b) const { - return fmod(b + fmod(a, b), b); + T res = fmod(a, b); + if ((res != 0) && ((b < 0) != (res < 0))) res += b; + return res; } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h old mode 100644 new mode 100755 index ff55d2f2040a17c32720df08c1ac0b00cc1d7a02..a910c326196bc61758c3be7db3b8ac5d85b0095c --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h @@ -22,15 +22,20 @@ namespace operators { template struct PowFunctor { inline HOSTDEVICE T operator()(T a, T b) const { -#ifdef __CUDA_ARCH__ - // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and - // it will return a float number like 2.99... , which floor to 2 - // when cast to int by default and it is wrong. - // Use llrint to cast it to the nearest integer, which is 3. + // TODO(wujionghao): A potential speed improvement is supporting different + // types in C++. + // #ifdef __CUDA_ARCH__ + // // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and + // // it will return a float number like 2.99... , which floor to 2 + // // when cast to int by default and it is wrong. + // // Use llrint to cast it to the nearest integer, which is 3. + // if (std::is_integral::value) { + // return std::llrint(std::pow(a, b)); + // } + // #endif if (std::is_integral::value) { return std::llrint(std::pow(a, b)); } -#endif return std::pow(a, b); } }; diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 2cf08e5c3409acb6b8a43058e143df55fd563d74..793519b40182114c13e63dd32caaa382d55fa52d 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -83,7 +83,6 @@ Return an identity tensor whose shape is [num_rows, num_columns]. namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -using float16 = paddle::platform::float16; REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, @@ -93,4 +92,4 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel, ops::EyeKernel, ops::EyeKernel, ops::EyeKernel, - ops::EyeKernel); + ops::EyeKernel); diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index f59d46ec79bd0960392ed1b8b3c8ee27b2317e39..c4bdd9e439c54db03f8fa8c4fe439ed6edbd0c7a 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -18,6 +18,7 @@ limitations under the License. 
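Reviewer note: the new floor-divide functors trap on a zero divisor (printf plus trap on device, PADDLE_THROW on host), and the reworked mod functors fold the remainder back so that its sign follows the divisor, i.e. Python semantics instead of C truncation semantics. A small self-contained sketch with worked examples:

#include <cassert>
#include <stdexcept>

// Python-style modulo: the result takes the sign of the divisor.
template <typename T>
T PyModSketch(T a, T b) {
  if (b == 0) throw std::invalid_argument("divide by zero in mod");  // sketch-only guard
  T res = a % b;
  if (res != 0 && ((res < 0) != (b < 0))) res += b;
  return res;
}

int main() {
  assert(PyModSketch(-7, 3) == 2);   // C's -7 % 3 would yield -1
  assert(PyModSketch(7, -3) == -2);  // sign follows the divisor
  return 0;
}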
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" @@ -158,5 +159,133 @@ void GPUGatherNd(const framework::ExecutionContext& context, end_size); } +template +__global__ void GatherGPUKernel(const T* input, const U* index, T* out, + int outer_dim_size, int inner_dim_size, + int out_index_dim_size, + int input_index_dim_size, int size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < size; idx += blockDim.x * gridDim.x) { + int inner_dim_index = idx / (outer_dim_size * out_index_dim_size); + int next_idx = idx % (outer_dim_size * out_index_dim_size); + int index_dim_index = next_idx / (outer_dim_size); + int out_dim_index = next_idx % outer_dim_size; + int input_index = + inner_dim_index * (outer_dim_size * input_index_dim_size) + + index[index_dim_index] * outer_dim_size + out_dim_index; + out[idx] = input[input_index]; + } +} + +template +__global__ void GatherGradGPUKernel(const T* input, const U* index, T* out, + int outer_dim_size, int inner_dim_size, + int input_index_dim_size, + int out_index_dim_size, int size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < size; idx += blockDim.x * gridDim.x) { + int inner_dim_index = idx / (outer_dim_size * input_index_dim_size); + int next_idx = idx % (outer_dim_size * input_index_dim_size); + int index_dim_index = next_idx / (outer_dim_size); + int out_dim_index = next_idx % outer_dim_size; + int out_index = inner_dim_index * (outer_dim_size * out_index_dim_size) + + index[index_dim_index] * outer_dim_size + out_dim_index; + paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx)); + } +} + +template +void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, + const Tensor* axis, Tensor* out, + const paddle::platform::Place& place, + const framework::ExecutionContext& ctx) { + int axis_size = axis->numel(); + int index_size = index->numel(); + int input_size = input->numel(); + auto input_dim = input->dims(); + auto* input_data = input->data(); + auto* index_data = index->data(); + + if (input->numel() == 0) return; + PADDLE_ENFORCE_EQ(axis_size, 1, + platform::errors::InvalidArgument( + "Axis size should be 1, but received %d", axis_size)); + Tensor cpu_axis; + framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis); + int axis_index = cpu_axis.data()[0]; + int index_dim_size = input_dim[axis_index]; + + int inner_dim_size = 1; + int outer_dim_size = 1; + std::vector out_dim_vec; + + for (int i = 0; i < axis_index; i++) { + inner_dim_size *= input_dim[i]; + out_dim_vec.push_back(input_dim[i]); + } + out_dim_vec.push_back(index_size); + for (int i = axis_index + 1; i < input_dim.size(); i++) { + outer_dim_size *= input_dim[i]; + out_dim_vec.push_back(input_dim[i]); + } + auto out_dim = framework::make_ddim(out_dim_vec); + + out->Resize(out_dim); + auto* out_data = out->mutable_data(place); + int out_size = out->numel(); + + int threads = 512; + int grid = (out_size + threads - 1) / threads; + auto stream = ctx.cuda_device_context().stream(); + GatherGPUKernel<<>>( + input_data, index_data, out_data, outer_dim_size, inner_dim_size, + index_size, index_dim_size, out_size); +} + +template +void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index, + const Tensor* axis, Tensor* out, + const paddle::platform::Place& place, + const 
framework::ExecutionContext& ctx) { + auto* index_data = index->data(); + + int axis_size = axis->numel(); + int index_size = index->numel(); + int input_size = input->numel(); + auto input_dim = input->dims(); + auto* input_data = input->data(); + + if (input->numel() == 0) return; + PADDLE_ENFORCE_EQ(axis_size, 1, + platform::errors::InvalidArgument( + "Axis size should be 1, but received %d", axis_size)); + Tensor cpu_axis; + framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis); + int axis_index = cpu_axis.data()[0]; + int input_index_dim_size = input_dim[axis_index]; + + int inner_dim_size = 1; + int outer_dim_size = 1; + + for (int i = 0; i < axis_index; i++) { + inner_dim_size *= input_dim[i]; + } + for (int i = axis_index + 1; i < input_dim.size(); i++) { + outer_dim_size *= input_dim[i]; + } + + auto* out_data = out->mutable_data(place); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto out_dim = out->dims(); + int out_index_dim_size = out_dim[axis_index]; + operators::math::set_constant(*dev_ctx, out, 0.0); + + int threads = 512; + int grid = (input_size + threads - 1) / threads; + auto stream = ctx.cuda_device_context().stream(); + GatherGradGPUKernel<<>>( + input_data, index_data, out_data, outer_dim_size, inner_dim_size, + input_index_dim_size, out_index_dim_size, input_size); +} } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index f5a7bffe4745360a307a4b7c61b30c871cf6c756..c12a3b8adc97893f523b307a56c0e6b04ea8d675 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -15,10 +15,12 @@ limitations under the License. */ #pragma once #include #include +#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -124,5 +126,110 @@ void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input, } } +template +void GatherV2Function(const Tensor* input, const Tensor* index, + const Tensor* axis, Tensor* out, + const paddle::platform::Place& place) { + auto* axis_data = axis->data(); + auto* index_data = index->data(); + + int axis_size = axis->numel(); + int index_size = index->numel(); + int input_size = input->numel(); + auto input_dim = input->dims(); + auto* input_data = input->data(); + + if (input->numel() == 0) return; + PADDLE_ENFORCE_EQ(axis_size, 1, + platform::errors::InvalidArgument( + "Axis size should be 1, but received %d", axis_size)); + int axis_index = axis_data[0]; + + int input_index_dim_size = input_dim[axis_index]; + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_LT(index_data[i], input_index_dim_size, + platform::errors::InvalidArgument( + "The element of Index must be less than the size of " + "input dim size of axis which is %d, but received " + "index element which is %d in the %d index.", + input_index_dim_size, index_data[i], i)); + } + + int inner_dim_size = 1; + int outer_dim_size = 1; + std::vector out_dim_vec; + + for (int i = 0; i < axis_index; i++) { + inner_dim_size *= input_dim[i]; + out_dim_vec.push_back(input_dim[i]); + } + out_dim_vec.push_back(index_size); + for (int i = axis_index + 1; i < input_dim.size(); i++) { + outer_dim_size *= input_dim[i]; + out_dim_vec.push_back(input_dim[i]); + } + auto out_dim = framework::make_ddim(out_dim_vec); + + out->Resize(out_dim); + auto* out_data = 
out->mutable_data(place); + + int out_index = 0; + for (int i = 0; i < inner_dim_size; i++) { + for (int j = 0; j < index_size; j++) { + for (int k = 0; k < outer_dim_size; k++) { + int index = k + index_data[j] * outer_dim_size + + (i * input_size / inner_dim_size); + out_data[out_index] = input_data[index]; + out_index++; + } + } + } +} + +template +void GatherV2GradFunction(const Tensor* input, const Tensor* index, + const Tensor* axis, Tensor* out, + const paddle::platform::Place& place) { + auto* axis_data = axis->data(); + auto* index_data = index->data(); + + int axis_size = axis->numel(); + auto input_dim = input->dims(); + auto* input_data = input->data(); + + if (input->numel() == 0) return; + PADDLE_ENFORCE_EQ(axis_size, 1, + platform::errors::InvalidArgument( + "Axis size should be 1, but received %d", axis_size)); + int axis_index = axis_data[0]; + int input_index_dim_size = input_dim[axis_index]; + + int inner_dim_size = 1; + int outer_dim_size = 1; + + for (int i = 0; i < axis_index; i++) { + inner_dim_size *= input_dim[i]; + } + for (int i = axis_index + 1; i < input_dim.size(); i++) { + outer_dim_size *= input_dim[i]; + } + + auto* out_data = out->mutable_data(place); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto out_dim = out->dims(); + int out_index_dim_size = out_dim[axis_index]; + operators::math::set_constant(*dev_ctx, out, 0.0); + + for (int i = 0; i < inner_dim_size; i++) { + for (int j = 0; j < input_index_dim_size; j++) { + for (int k = 0; k < outer_dim_size; k++) { + int index = k + index_data[j] * outer_dim_size + + i * outer_dim_size * out_index_dim_size; + out_data[index] += input_data[j * outer_dim_size + k]; + } + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index c22c8a18ca63a05265ac6991cf0e0cbd9e7ea5ed..1427bd04d3442be26be931ca31bf358ebd23efae 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -45,7 +45,7 @@ class GatherNdOp : public framework::OperatorWithKernel { index_dims[index_dims_size - 1], x_dims_size, platform::errors::InvalidArgument( "Input(Index).shape[-1] should be no greater than Input(X).rank")); - PADDLE_ENFORCE_GE(index_dims_size, 2UL, + PADDLE_ENFORCE_GE(index_dims_size, 1UL, platform::errors::InvalidArgument( "The rank of Input(Index) should be greater than 1")); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 6a3abaa600281ac4a9762d5c73d398974abbf041..28afeb6f541c68fe7e0719a782fd8c9147b15163 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
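Reviewer note: GatherV2Function above views the input as (inner, axis, outer) blocks, where inner_dim_size is the product of the dimensions before `axis` and outer_dim_size the product of those after it; the gathered element then sits at k + index[j] * outer + i * axis_dim * outer. A self-contained sketch of the same traversal, with a worked example:

#include <cstdint>
#include <vector>

// Gather along `axis` of a row-major tensor with shape `dims`.
std::vector<float> GatherAxisSketch(const std::vector<float>& x,
                                    const std::vector<int64_t>& dims, int axis,
                                    const std::vector<int64_t>& index) {
  int64_t inner = 1, outer = 1;
  for (int i = 0; i < axis; ++i) inner *= dims[i];
  for (size_t i = axis + 1; i < dims.size(); ++i) outer *= dims[i];
  const int64_t axis_dim = dims[axis];

  std::vector<float> out;
  out.reserve(inner * index.size() * outer);
  for (int64_t i = 0; i < inner; ++i) {          // dims before axis
    for (size_t j = 0; j < index.size(); ++j) {  // gathered positions
      for (int64_t k = 0; k < outer; ++k) {      // dims after axis
        out.push_back(x[k + index[j] * outer + i * axis_dim * outer]);
      }
    }
  }
  return out;
}

// Example: x of shape [2, 3], axis = 1, index = {2, 0} gives an output of
// shape [2, 2]: {x[0][2], x[0][0], x[1][2], x[1][0]}.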
*/ #include #include #include "paddle/fluid/framework/ddim.h" - +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -78,6 +78,9 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The source input of gather op"); AddInput("Index", "The index input of gather op"); + AddInput("Axis", + "The Tensor which contains the axis that we do gather operation.") + .AsDispensable(); AddOutput("Out", "The output of gather op"); AddAttr( "overwrite", @@ -120,6 +123,8 @@ class GatherGradOpMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("gather_grad"); op->SetInput("Index", this->Input("Index")); + op->SetInput("Axis", this->Input("Axis")); + op->SetInput("X", this->Input("X")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); @@ -147,3 +152,7 @@ REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel); +REGISTER_OP_VERSION(gather) + .AddCheckpoint(R"ROC(upgrad gather, add attribut [axis])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "axis", "Specify the axis of gather operation.", {})); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 5bef547c0542b922f646f72ffb7310ef4eb279e9..37fbfb21f60a0568390c6798dc305c91fc8af886 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -31,6 +31,33 @@ class GatherOpCUDAKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto *output = ctx.Output("Out"); + if (ctx.HasInput("Axis")) { + const Tensor *axis = ctx.Input("Axis"); + const auto &index_type = index->type(); + const auto &axis_type = axis->type(); + auto place = ctx.GetPlace(); + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT32) { + GatherV2CUDAFunction(x, index, axis, output, place, + ctx); + } + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT64) { + GatherV2CUDAFunction(x, index, axis, output, place, + ctx); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == framework::proto::VarType::INT32) { + GatherV2CUDAFunction(x, index, axis, output, place, + ctx); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == framework::proto::VarType::INT64) { + GatherV2CUDAFunction(x, index, axis, output, place, + ctx); + } + return; + } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; const auto &index_type = index->type(); @@ -64,6 +91,34 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); + if (ctx.HasInput("Axis")) { + const Tensor *axis = ctx.Input("Axis"); + const auto &index_type = index->type(); + const auto &axis_type = axis->type(); + auto place = ctx.GetPlace(); + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT32) { + GatherV2GradCUDAFunction(dO, index, axis, dX, + place, ctx); + } + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT64) { + GatherV2GradCUDAFunction(dO, index, axis, dX, + place, ctx); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == 
framework::proto::VarType::INT32) { + GatherV2GradCUDAFunction(dO, index, axis, dX, + place, ctx); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == framework::proto::VarType::INT64) { + GatherV2GradCUDAFunction(dO, index, axis, dX, + place, ctx); + } + return; + } + dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); auto &place = *ctx.template device_context() diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index e4ce13ca8fc0b49e997749d0f47f15213a3b44f7..8ec0d6ce0b69c791f9bff58f1681f8d4543c57dd 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -35,6 +35,30 @@ class GatherOpKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto *output = ctx.Output("Out"); + if (ctx.HasInput("Axis")) { + const Tensor *axis = ctx.Input("Axis"); + const auto &index_type = index->type(); + const auto &axis_type = axis->type(); + auto place = ctx.GetPlace(); + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT32) { + GatherV2Function(x, index, axis, output, place); + } + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT64) { + GatherV2Function(x, index, axis, output, place); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == framework::proto::VarType::INT32) { + GatherV2Function(x, index, axis, output, place); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == framework::proto::VarType::INT64) { + GatherV2Function(x, index, axis, output, place); + } + return; + } + output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; @@ -70,6 +94,30 @@ class GatherGradientOpKernel : public framework::OpKernel { auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); + if (ctx.HasInput("Axis")) { + const Tensor *axis = ctx.Input("Axis"); + const auto &index_type = index->type(); + const auto &axis_type = axis->type(); + auto place = ctx.GetPlace(); + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT32) { + GatherV2GradFunction(dO, index, axis, dX, place); + } + if (index_type == framework::proto::VarType::INT32 && + axis_type == framework::proto::VarType::INT64) { + GatherV2GradFunction(dO, index, axis, dX, place); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == framework::proto::VarType::INT32) { + GatherV2GradFunction(dO, index, axis, dX, place); + } + if (index_type == framework::proto::VarType::INT64 && + axis_type == framework::proto::VarType::INT64) { + GatherV2GradFunction(dO, index, axis, dX, place); + } + return; + } + dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); auto &place = *ctx.template device_context() diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 898c063afdd43ce2cb81d10b3e0ba6fa3bf18859..4f128463375b91803a7a4d02a27dd78157961aac 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
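Reviewer note: the gradient kernels mirror the forward decomposition, scattering each slice of the output gradient back to the position named by Index and accumulating when an index repeats (+= on CPU, CudaAtomicAdd on GPU). A minimal CPU sketch of that scatter-add, written against the same (inner, axis, outer) layout as the gather sketch above; it illustrates the adjoint operation and is not the in-tree code:

#include <cstdint>
#include <vector>

// Scatter-add grad_out (shape inner x index.size() x outer) back into
// grad_x (shape inner x axis_dim x outer); repeated indices accumulate.
void ScatterAddAxisSketch(const std::vector<float>& grad_out,
                          const std::vector<int64_t>& index, int64_t inner,
                          int64_t axis_dim, int64_t outer,
                          std::vector<float>* grad_x) {
  grad_x->assign(inner * axis_dim * outer, 0.f);
  for (int64_t i = 0; i < inner; ++i) {
    for (size_t j = 0; j < index.size(); ++j) {
      for (int64_t k = 0; k < outer; ++k) {
        (*grad_x)[k + index[j] * outer + i * axis_dim * outer] +=
            grad_out[k + j * outer + i * index.size() * outer];
      }
    }
  }
}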
*/ #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN @@ -31,26 +32,21 @@ class CPUGaussianRandomKernel : public framework::OpKernel { float mean = context.Attr("mean"); float std = context.Attr("std"); auto* tensor = context.Output("Out"); - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); std::normal_distribution dist(mean, std); - const std::string op_type = "gaussian_random"; auto shape = GetShape(context, op_type); tensor->Resize(shape); int64_t size = tensor->numel(); T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); + data[i] = dist(*engine); } } -}; +}; // namespace operators template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 3bf34fc685ee8af39b66f444c35d606c4b5d8ffb..93f9e108723fbd56e0d3bf5d439614c2c20bb393 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -41,13 +41,14 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { int n = input->dims()[0]; int c = input->dims()[1]; - int h = input->dims()[2]; - int w = input->dims()[3]; - const int size[4] = {n, c, h, w}; + int out_h = grid->dims()[1]; + int out_w = grid->dims()[2]; + const int size[4] = {n, c, out_h, out_w}; const T* input_data = input->data(); const T* grid_data = grid->data(); - T* output_data = output->mutable_data({n, c, h, w}, ctx.GetPlace()); + T* output_data = + output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); ScopedSpatialTransformerDescriptor st_desc; cudnnSpatialTransformerDescriptor_t cudnn_st_desc = @@ -97,7 +98,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { const T* grid_data = grid->data(); const T* output_grad_data = output_grad->data(); T* input_grad_data = - input_grad->mutable_data(output_grad_dims, ctx.GetPlace()); + input_grad->mutable_data(input->dims(), ctx.GetPlace()); T* grid_grad_data = grid_grad->mutable_data({n, h, w, 2}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 5be490379642e8761a6821fa0dc0d332ca5b41ef..deb71b807128e5c0b173b517e60832894ced41e5 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
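Reviewer note: the cuDNN grid-sample kernel above now takes the output spatial size from the grid instead of from the input, and sizes the input gradient from the input itself rather than from the output gradient. A worked shape example under those rules:

// X:    [N, C, H_in, W_in]   = [2, 3, 32, 32]
// Grid: [N, H_out, W_out, 2] = [2, 16, 16, 2]
// Output:        [N, C, H_out, W_out] = [2, 3, 16, 16]
// grad of X:     same shape as X      = [2, 3, 32, 32]
// grad of Grid:  same shape as Grid   = [2, 16, 16, 2]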
*/ #include "paddle/fluid/operators/grid_sampler_op.h" #include +#include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -58,21 +59,10 @@ class GridSampleOp : public framework::OperatorWithKernel { "Input(X) and Input(Grid) dimension[0] should be equal, but " "received X dimension[0](%d) != Grid dimension[0](%d)", x_dims[0], grid_dims[0])); - PADDLE_ENFORCE_EQ( - grid_dims[1], x_dims[2], - platform::errors::InvalidArgument( - "Input(X) dims[2] and Input(Grid) dims[1] should be equal, but " - "received X dimension[2](%d) != Grid dimension[1](%d)", - x_dims[2], grid_dims[1])); - PADDLE_ENFORCE_EQ( - grid_dims[2], x_dims[3], - platform::errors::InvalidArgument( - "Input(X) dims[3] and Input(Grid) dims[2] should be equal, but " - "received X dimension[3](%d) != Grid dimension[2](%d)", - x_dims[3], grid_dims[2])); } - ctx->SetOutputDim("Output", x_dims); + ctx->SetOutputDim("Output", + {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); ctx->ShareLoD("X", "Output"); } @@ -108,15 +98,37 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default true) Only used in cudnn kernel, need install cudnn") .SetDefault(true); + AddAttr( + "align_corners", + "(bool, default true) If align_corners is true, it will project" + "-1 and 1 to the centers of the corner pixels. Otherwise, it will " + "project" + "-1 and 1 to the image edges.") + .SetDefault(true); + + AddAttr( + "mode", + "(bool, default true) The interpolation method which can be 'bilinear'" + " or 'nearest'.") + .SetDefault("bilinear"); + + AddAttr( + "padding_mode", + "(bool, default true) The padding method used when source" + "index is out of input images. It can be 'zeros', 'reflect' and " + "'border'.") + .SetDefault("zeros"); + AddComment(R"DOC( - This operation samples input X by using bilinear interpolation based on + This operation samples input X by using bilinear or nearest interpolation based on flow field grid, which is usually generated by affine_grid. The grid of shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates with shape [N, H, W] each, where grid_x is indexing the 4th dimension (in width dimension) of input data x and grid_y is indexing the 3rd dimension (in height dimension), finally results is the bilinear - interpolation value of 4 nearest corner points. + interpolation value or nearest value of 4 nearest corner points. + For bilinear interpolation mode: Step 1: Get (x, y) grid coordinates and scale to [0, H-1/W-1]. diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..999f990448ca6370dadacbdaee5bf3bcadcaca0e --- /dev/null +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -0,0 +1,490 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
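Reviewer note: the new align_corners attribute decides how a normalized grid coordinate in [-1, 1] is mapped to a pixel position, matching the _unnormalize helper defined in the CUDA file below. A small sketch of the two mappings:

#include <cstdio>

// Map a normalized coordinate in [-1, 1] to a pixel position.
float UnnormalizeSketch(float coord, int size, bool align_corners) {
  if (align_corners) {
    // -1 and 1 land on the centers of the corner pixels: 0 and size - 1.
    return (coord + 1.f) / 2.f * (size - 1);
  }
  // -1 and 1 land on the image edges: -0.5 and size - 0.5.
  return ((coord + 1.f) * size - 1.f) / 2.f;
}

int main() {
  std::printf("%g\n", UnnormalizeSketch(1.f, 4, true));   // prints 3
  std::printf("%g\n", UnnormalizeSketch(1.f, 4, false));  // prints 3.5
  return 0;
}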
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/grid_sampler_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +template +static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, + int sW, int H, int W, + T delta) { + if (in_bounds(h, w, H, W)) { + platform::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T _unnormalize(T coord, int size, + bool align_corners) { + if (align_corners) { + return ((coord + 1.f) / 2) * (size - 1); + } else { + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T clip_indexes(T in, int max_value) { + return min(static_cast(max_value), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T reflect_indexes(T in, int twice_low, + int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T compute_positions(T coord, int size, + PaddingMode padding_mode, + bool align_corners) { + coord = _unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = clip_indexes(coord, size - 1); + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = reflect_indexes(coord, 0, 2 * (size - 1)); + } else { + coord = reflect_indexes(coord, -1, 2 * size - 1); + } + coord = clip_indexes(coord, size - 1); + } + return coord; +} + +template +static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size, + bool align_corners, + T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +compute_positions_with_mask(T coord, int size, PaddingMode padding_mode, 
+ bool align_corners, T* grad_in) { + T grad_clip, grad_refl; + coord = _unnormalize_with_mask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = clip_indexes_with_mask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl); + } else { + coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl); + } + coord = clip_indexes_with_mask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + return coord; +} + +template +__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, + int out_h, int out_w, int in_h, + int in_w, const T* input, const T* grid, + T* output, const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + int out_sN = out_c * out_h * out_w; + int out_sC = out_h * out_w; + int out_sH = out_w; + int out_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = compute_positions(ix, in_w, padding_mode, align_corners); + iy = compute_positions(iy, in_h, padding_mode, align_corners); + + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (in_bounds(iy_se, ix_se, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(round(ix)); + int iy_nearest = static_cast(round(iy)); + + auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +class GridSampleOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.cuda_device_context(); + auto align_corners = 
ctx.Attr("align_corners"); + auto padding_mode_s = ctx.Attr("padding_mode"); + auto mode_s = ctx.Attr("mode"); + PaddingMode padding_mode; + Mode mode; + if (padding_mode_s == "border") { + padding_mode = PaddingMode::border; + } else if (padding_mode_s == "reflect") { + padding_mode = PaddingMode::reflect; + } else { + padding_mode = PaddingMode::zeros; + } + + if (mode_s == "nearest") { + mode = Mode::nearest; + } else { + mode = Mode::bilinear; + } + + auto* input = ctx.Input("X"); + auto* grid = ctx.Input("Grid"); + const int n = grid->dims()[0]; + const int out_h = grid->dims()[1]; + const int out_w = grid->dims()[2]; + const int c = input->dims()[1]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + auto* output = ctx.Output("Output"); + auto* output_data = output->mutable_data(ctx.GetPlace()); + + VLOG(3) << "set constant"; + math::SetConstant()( + dev_ctx, output, static_cast(0)); + int count = static_cast(n * out_h * out_w); + + auto cu_stream = dev_ctx.stream(); + + int block = 512; + int grid_size = (count + block - 1) / block; + grid_sample_cuda_kernel<<>>( + count, n, c, out_h, out_w, in_h, in_w, input->data(), + grid->data(), output_data, mode, padding_mode, align_corners); + } +}; + +template +__global__ void grid_sampler_cuda_backward_kernel( + const int nthreads, const T* grad_output, const T* input, const T* grid, + int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input, + T* grad_grid, const Mode mode, const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + + int gOut_sN = out_c * out_h * out_w; + int gOut_sC = out_h * out_w; + int gOut_sH = out_w; + int gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners, + &gix_mult); + iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners, + &giy_mult); + + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + int inp_offset_NC = n * inp_sN; + for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, + nw * gOut); + atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, + ne * gOut); + atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, + sw * gOut); + atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, + se * gOut); + + if (in_bounds(iy_nw, ix_nw, 
in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (in_bounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(::round(ix)); + int iy_nearest = static_cast(::round(iy)); + + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (int c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h, + in_w, grad_output[gOut_offset]); + } + + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } +} + +template +class GridSampleGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.cuda_device_context(); + auto align_corners = ctx.Attr("align_corners"); + auto padding_mode_s = ctx.Attr("padding_mode"); + auto mode_s = ctx.Attr("mode"); + + PaddingMode padding_mode; + Mode mode; + if (padding_mode_s == "border") { + padding_mode = PaddingMode::border; + } else if (padding_mode_s == "reflect") { + padding_mode = PaddingMode::reflect; + } else { + padding_mode = PaddingMode::zeros; + } + + if (mode_s == "nearest") { + mode = Mode::nearest; + } else { + mode = Mode::bilinear; + } + + auto* input = ctx.Input("X"); + auto* grid = ctx.Input("Grid"); + auto* output_grad = ctx.Input(framework::GradVarName("Output")); + + const int n = grid->dims()[0]; + const int out_h = grid->dims()[1]; + const int out_w = grid->dims()[2]; + const int c = input->dims()[1]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; + + auto* input_grad = ctx.Output(framework::GradVarName("X")); + input_grad->mutable_data(ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), + input_grad, static_cast(0)); + auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); + grid_grad->mutable_data(ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), + grid_grad, static_cast(0)); + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + int block = 512; + int grid_size = (count + block - 1) / block; + grid_sampler_cuda_backward_kernel<<>>( + count, output_grad->data(), input->data(), grid->data(), n, c, + out_h, out_w, in_h, in_w, input_grad->data(), grid_grad->data(), + mode, padding_mode, align_corners); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel, + ops::GridSampleOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(grid_sampler_grad, + ops::GridSampleGradOpCUDAKernel, + 
ops::GridSampleGradOpCUDAKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 08a6043eb07a6e44d46428ee195f6cb28c2ee77c..eda800e78faf5da2bb379b8101e4823c5bc2d2f8 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" @@ -22,6 +25,13 @@ limitations under the License. */ namespace paddle { namespace operators { +enum class Mode { + bilinear, + nearest, +}; + +enum class PaddingMode { zeros, border, reflect }; + using Tensor = framework::Tensor; template @@ -39,64 +49,229 @@ static inline bool isInBound(T x, T y, T x_max, T y_max) { } template -static void CalcGridLocations(const platform::CPUDeviceContext& ctx, - const Tensor& grid, Tensor* x_w, Tensor* x_e, - Tensor* y_n, Tensor* y_s, Tensor* d_w, - Tensor* d_e, Tensor* d_n, Tensor* d_s) { +static inline void unnormalize(const platform::CPUDeviceContext& ctx, + Tensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners) { auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + + if (!align_corners) { + auto factor = static_cast((max_val + 1) * 0.5); + grid_slice_t.device(place) = + (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); + } else { + auto factor = static_cast(max_val * 0.5); + grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; + } +} + +template +static inline void clip(const platform::CPUDeviceContext& ctx, + Tensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners, std::string padding_mode) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + if (padding_mode == "border") { + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } else if (padding_mode == "reflect") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } + } +} + +template +static inline void clipWithMask(const platform::CPUDeviceContext& ctx, + const int max_val, // height-1 or width-1 + bool align_corners, std::string padding_mode, + Tensor* grid_slice, Tensor* grid_scale) { + auto& place = *ctx.eigen_device(); + grid_scale->mutable_data(grid_slice->dims(), ctx.GetPlace()); + + auto grid_slice_t = EigenTensor::From(*grid_slice); + auto factor = static_cast(max_val * 0.5); + if (!align_corners) { + factor = static_cast((max_val + 1) * 0.5); + } + auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); + + if (padding_mode == "border") { + // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); + auto res = grid_slice_t.cwiseMax(static_cast(0)) + 
.cwiseMin(static_cast(max_val)); + + auto in_bound = (res == grid_slice_t); + grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); + grid_slice_t.device(place) = res; + } else if (padding_mode == "reflect") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto is_neg = (grid_slice_t < static_cast(0)); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()); + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + auto reflected = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + auto clipped = reflected.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + auto in_bound = (clipped == reflected).template cast(); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * + in_bound; + grid_slice_t.device(place) = clipped; + } + } +} + +template +static void calcGridLocations(const platform::CPUDeviceContext& ctx, + const Tensor& grid, const int in_h, + const int in_w, bool align_corners, + std::string padding_mode, Tensor* grid_x, + Tensor* grid_y) { const int n = grid.dims()[0]; - const int h = grid.dims()[1]; - const int w = grid.dims()[2]; - const T x_max = static_cast(w - 1); - const T y_max = static_cast(h - 1); + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - Tensor grid_x, grid_y; - T* grid_x_data = grid_x.mutable_data({n, h, w}, ctx.GetPlace()); - T* grid_y_data = grid_y.mutable_data({n, h, w}, ctx.GetPlace()); + T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); const T* grid_data = grid.data(); - for (int i = 0; i < n * h * w; i++) { + for (int i = 0; i < n * out_h * out_w; i++) { grid_x_data[i] = grid_data[2 * i]; grid_y_data[i] = grid_data[(2 * i) + 1]; } - Tensor ones; - ones.mutable_data({n, h, w}, ctx.GetPlace()); - auto ones_t = EigenTensor::From(ones).setConstant(1.0); - Tensor half_xmax; - Tensor half_ymax; - half_xmax.mutable_data({n, h, w}, ctx.GetPlace()); - auto half_xmax_t = - EigenTensor::From(half_xmax).setConstant(0.5 * x_max); - half_ymax.mutable_data({n, h, w}, ctx.GetPlace()); - auto half_ymax_t = - EigenTensor::From(half_ymax).setConstant(0.5 * y_max); - - // scale grid to [0, h-1/w-1] - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t; - grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t; + unnormalize(ctx, grid_x, in_w - 1, align_corners); + unnormalize(ctx, grid_y, in_h - 1, align_corners); + + clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); + clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); +} + +template +static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx, + const Tensor& grid, const int in_h, + 
const int in_w, bool align_corners, + std::string padding_mode, Tensor* grid_x, + Tensor* grid_y, Tensor* grid_x_scale, + Tensor* grid_y_scale) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + unnormalize(ctx, grid_x, in_w - 1, align_corners); + unnormalize(ctx, grid_y, in_h - 1, align_corners); + + clipWithMask(ctx, in_w - 1, align_corners, padding_mode, grid_x, + grid_x_scale); + clipWithMask(ctx, in_h - 1, align_corners, padding_mode, grid_y, + grid_y_scale); +} + +template +static void getGridPointValue(const Tensor& input, Tensor* output, + const Tensor& x, const Tensor& y) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int in_h = input.dims()[2]; + const int in_w = input.dims()[3]; + const int out_h = x.dims()[1]; + const int out_w = x.dims()[2]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto output_t = EigenTensor::From(*output).setConstant((T)0); + auto input_t = EigenTensor::From(input); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), + (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + output_t(i, j, k, l) = + input_t(i, j, static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))); + } + } + } + } + } +} + +template +static void allNeigbors(const platform::CPUDeviceContext& ctx, + const Tensor& input, Tensor* grid_x, Tensor* grid_y, + Tensor* x_w, Tensor* x_e, Tensor* y_n, + Tensor* y_s, // positions + Tensor* d_w, Tensor* d_e, Tensor* d_n, + Tensor* d_s, // distance + Tensor* v_wn, Tensor* v_en, Tensor* v_ws, + Tensor* v_es) { // values + auto& place = *ctx.eigen_device(); + + const int c = input.dims()[1]; + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; // calculate coords of 4 corner points - x_w->mutable_data({n, h, w}, ctx.GetPlace()); - x_e->mutable_data({n, h, w}, ctx.GetPlace()); - y_n->mutable_data({n, h, w}, ctx.GetPlace()); - y_s->mutable_data({n, h, w}, ctx.GetPlace()); + x_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + x_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + y_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + y_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); auto x_w_t = EigenTensor::From(*x_w); auto x_e_t = EigenTensor::From(*x_e); auto y_n_t = EigenTensor::From(*y_n); auto y_s_t = EigenTensor::From(*y_s); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + x_w_t.device(place) = grid_x_t.floor(); - x_e_t.device(place) = x_w_t + ones_t; + x_e_t.device(place) = x_w_t + static_cast(1); y_n_t.device(place) = grid_y_t.floor(); - y_s_t.device(place) = y_n_t + ones_t; + y_s_t.device(place) = y_n_t + static_cast(1); // calculate distances to 4 sides - d_w->mutable_data({n, h, w}, ctx.GetPlace()); - d_e->mutable_data({n, h, w}, ctx.GetPlace()); - d_n->mutable_data({n, h, w}, ctx.GetPlace()); - d_s->mutable_data({n, h, w}, ctx.GetPlace()); + d_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + 
d_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + d_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); + d_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); auto d_w_t = EigenTensor::From(*d_w); auto d_e_t = EigenTensor::From(*d_e); auto d_n_t = EigenTensor::From(*d_n); @@ -105,28 +280,100 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx, d_e_t.device(place) = x_e_t - grid_x_t; d_n_t.device(place) = grid_y_t - y_n_t; d_s_t.device(place) = y_s_t - grid_y_t; + + // calc 4 corner points value + v_wn->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); + v_en->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); + v_ws->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); + v_es->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); + getGridPointValue(input, v_wn, *x_w, *y_n); + getGridPointValue(input, v_en, *x_e, *y_n); + getGridPointValue(input, v_ws, *x_w, *y_s); + getGridPointValue(input, v_es, *x_e, *y_s); } template -static void GetGridPointValue(const Tensor& input, Tensor* output, - const Tensor& x, const Tensor& y) { - const int n = input.dims()[0]; +static void bilinearInter(const platform::CPUDeviceContext& ctx, + const Tensor& input, Tensor* grid_x, Tensor* grid_y, + Tensor* out) { + auto& place = *ctx.eigen_device(); + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; const int c = input.dims()[1]; - const int h = input.dims()[2]; - const int w = input.dims()[3]; + + Tensor x_w, x_e, y_n, y_s; + Tensor d_w, d_e, d_n, d_s; + Tensor v_wn, v_en, v_ws, v_es; + + allNeigbors(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, + &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto d_w_scaled_t = + d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_e_scaled_t = + d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_n_scaled_t = + d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_s_scaled_t = + d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + auto output_t = EigenTensor::From(*out); + // bilinear interpolaetion by 4 corner points + output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + + v_en_t * d_w_scaled_t * d_s_scaled_t + + v_ws_t * d_e_scaled_t * d_n_scaled_t + + v_es_t * d_w_scaled_t * d_n_scaled_t; +} + +template +static void nearestInter(const platform::CPUDeviceContext& ctx, + const Tensor& input, Tensor* grid_x, Tensor* grid_y, + Tensor* out) { + auto& place = *ctx.eigen_device(); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + getGridPointValue(input, out, *grid_x, *grid_y); +} + +template +static void gatherOutputGradToInputGrad(const Tensor& output_grad, + Tensor* input_grad, const Tensor& x, + const Tensor& y, const Tensor& d1, + const Tensor& d2) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; auto x_t = EigenTensor::From(x); auto 
y_t = EigenTensor::From(y); - auto output_t = EigenTensor::From(*output).setConstant((T)0); - auto input_t = EigenTensor::From(input); + auto d1_t = EigenTensor::From(d1); + auto d2_t = EigenTensor::From(d2); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); for (int i = 0; i < n; i++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), + (T)(in_h - 1))) { for (int j = 0; j < c; j++) { - output_t(i, j, k, l) = - input_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))); + input_grad_t(i, j, static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); } } } @@ -135,29 +382,28 @@ static void GetGridPointValue(const Tensor& input, Tensor* output, } template -static void GatherOutputGradToInputGrad(const Tensor& output_grad, +static void gatherOutputGradToInputGrad(const Tensor& output_grad, Tensor* input_grad, const Tensor& x, - const Tensor& y, const Tensor& d1, - const Tensor& d2) { + const Tensor& y) { const int n = output_grad.dims()[0]; const int c = output_grad.dims()[1]; - const int h = output_grad.dims()[2]; - const int w = output_grad.dims()[3]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; auto x_t = EigenTensor::From(x); auto y_t = EigenTensor::From(y); - auto d1_t = EigenTensor::From(d1); - auto d2_t = EigenTensor::From(d2); auto input_grad_t = EigenTensor::From(*input_grad); auto output_grad_t = EigenTensor::From(output_grad); - for (int i = 0; i < n; i++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), + (T)(in_h - 1))) { for (int j = 0; j < c; j++) { input_grad_t(i, j, static_cast(round(y_t(i, k, l))), static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); + output_grad_t(i, j, k, l); } } } @@ -165,65 +411,126 @@ static void GatherOutputGradToInputGrad(const Tensor& output_grad, } } +template +static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx, + const Tensor& input, const Tensor& output_grad, + Tensor* grid_x, Tensor* grid_y, + Tensor* grid_x_scale, Tensor* grid_y_scale, + Tensor* input_grad, Tensor* grid_grad) { + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + Tensor x_w, x_e, y_n, y_s; + Tensor d_w, d_e, d_n, d_s; + Tensor v_wn, v_en, v_ws, v_es; + + allNeigbors(ctx, input, + grid_x, // grid_x + grid_y, // grid_y + &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en, + &v_ws, &v_es); + + // gather output grad value to input grad by corner point coords and weight + gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); + gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); + gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); + gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); + + auto v_wn_t = EigenTensor::From(v_wn); 
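+  // Together with v_wn_t above, the corner values and distances loaded next
+  // reconstruct the gradient w.r.t. the sampling grid. Because the bilinear
+  // forward pass computes
+  //   out = v_wn * d_e * d_s + v_en * d_w * d_s + v_ws * d_e * d_n + v_es * d_w * d_n,
+  // differentiating w.r.t. the unnormalized x and y coordinates gives
+  //   d(out)/dx = (v_en - v_wn) * d_s + (v_es - v_ws) * d_n
+  //   d(out)/dy = (v_ws - v_wn) * d_e + (v_es - v_en) * d_w.
+  // These terms are weighted by the incoming output gradient, accumulated over
+  // channels in the loop below, and finally scaled by the unnormalize/clip
+  // factors stored in grid_x_scale and grid_y_scale.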
+ auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto output_grad_t = EigenTensor::From(output_grad); + + Tensor grid_grad_x, grid_grad_y; + grid_grad_x.mutable_data({n, out_h, out_w}, ctx.GetPlace()); + grid_grad_y.mutable_data({n, out_h, out_w}, ctx.GetPlace()); + auto grid_grad_x_t = + EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); + auto grid_grad_y_t = + EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + grid_grad_x_t(i, k, l) += + ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + + (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * + output_grad_t(i, j, k, l); + grid_grad_y_t(i, k, l) += + ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + + (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * + output_grad_t(i, j, k, l); + } + } + } + } + + // const T x_max = static_cast(in_w - 1); + // const T y_max = static_cast(in_h - 1); + + auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); + auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); + grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; + grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; + + // gather grid_grad [x, y] in 3rd Dim + T* grid_grad_data = grid_grad->data(); + T* grid_grad_x_data = grid_grad_x.data(); + T* grid_grad_y_data = grid_grad_y.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_grad_data[2 * i] = grid_grad_x_data[i]; + grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; + } +} + template class GridSampleOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); + auto align_corners = ctx.Attr("align_corners"); + auto padding_mode = ctx.Attr("padding_mode"); + auto mode = ctx.Attr("mode"); + auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); - const int n = input->dims()[0]; + const int n = grid->dims()[0]; + const int out_h = grid->dims()[1]; + const int out_w = grid->dims()[2]; const int c = input->dims()[1]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - - // calc locations and distances of 4 corner points - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - CalcGridLocations( - ctx.template device_context(), *grid, &x_w, - &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s); + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; auto* output = ctx.Output("Output"); - output->mutable_data({n, c, h, w}, ctx.GetPlace()); + output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); math::SetConstant()( ctx.template device_context(), output, static_cast(0)); - // calc 4 corner points value - Tensor v_wn, v_en, v_ws, v_es; - v_wn.mutable_data({n, c, h, w}, ctx.GetPlace()); - v_en.mutable_data({n, c, h, w}, ctx.GetPlace()); - v_ws.mutable_data({n, c, h, w}, ctx.GetPlace()); - v_es.mutable_data({n, c, h, w}, ctx.GetPlace()); - GetGridPointValue(*input, &v_wn, x_w, y_n); - GetGridPointValue(*input, &v_en, x_e, y_n); - GetGridPointValue(*input, &v_ws, x_w, y_s); - GetGridPointValue(*input, &v_es, x_e, y_s); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = 
EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - auto d_w_scaled_t = - d_w_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1)); - auto d_e_scaled_t = - d_e_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1)); - auto d_n_scaled_t = - d_n_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1)); - auto d_s_scaled_t = - d_s_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1)); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - auto output_t = EigenTensor::From(*output); - // bilinear interpolaetion by 4 corner points - output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + - v_en_t * d_w_scaled_t * d_s_scaled_t + - v_ws_t * d_e_scaled_t * d_n_scaled_t + - v_es_t * d_w_scaled_t * d_n_scaled_t; + Tensor grid_x, grid_y; + calcGridLocations( + ctx.template device_context(), *grid, in_h, + in_w, align_corners, padding_mode, &grid_x, &grid_y); + if (mode == "bilinear") { + bilinearInter( + ctx.template device_context(), *input, + &grid_x, &grid_y, output); + } else if (mode == "nearest") { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + getGridPointValue(*input, output, grid_x, grid_y); + } } }; @@ -231,97 +538,48 @@ template class GridSampleGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto align_corners = ctx.Attr("align_corners"); + auto padding_mode = ctx.Attr("padding_mode"); + auto mode = ctx.Attr("mode"); + auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); auto* output_grad = ctx.Input(framework::GradVarName("Output")); - const int n = input->dims()[0]; + const int n = grid->dims()[0]; + const int out_h = grid->dims()[1]; + const int out_w = grid->dims()[2]; const int c = input->dims()[1]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); math::SetConstant()( ctx.template device_context(), input_grad, static_cast(0)); auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad->mutable_data({n, h, w, 2}, ctx.GetPlace()); + grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); math::SetConstant()( ctx.template device_context(), grid_grad, static_cast(0)); - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - CalcGridLocations( - ctx.template device_context(), *grid, &x_w, - &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s); - - // gather output grad value to input grad by corner point coords and weight - GatherOutputGradToInputGrad(*output_grad, input_grad, x_w, y_n, d_e, - d_s); - GatherOutputGradToInputGrad(*output_grad, input_grad, x_w, y_s, d_e, - d_n); - GatherOutputGradToInputGrad(*output_grad, input_grad, x_e, y_n, d_w, - d_s); - GatherOutputGradToInputGrad(*output_grad, input_grad, x_e, y_s, d_w, - d_n); - - // calc 4 corner points value - Tensor v_wn, v_en, v_ws, v_es; - v_wn.mutable_data({n, c, h, w}, ctx.GetPlace()); - v_en.mutable_data({n, c, h, w}, ctx.GetPlace()); - v_ws.mutable_data({n, c, h, w}, ctx.GetPlace()); - v_es.mutable_data({n, c, h, w}, ctx.GetPlace()); - GetGridPointValue(*input, &v_wn, x_w, y_n); - 
GetGridPointValue(*input, &v_en, x_e, y_n); - GetGridPointValue(*input, &v_ws, x_w, y_s); - GetGridPointValue(*input, &v_es, x_e, y_s); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto output_grad_t = EigenTensor::From(*output_grad); - - Tensor grid_grad_x, grid_grad_y; - grid_grad_x.mutable_data({n, h, w}, ctx.GetPlace()); - grid_grad_y.mutable_data({n, h, w}, ctx.GetPlace()); - auto grid_grad_x_t = EigenTensor::From(grid_grad_x).setConstant(0.0); - auto grid_grad_y_t = EigenTensor::From(grid_grad_y).setConstant(0.0); - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - grid_grad_x_t(i, k, l) += - ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + - (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * - output_grad_t(i, j, k, l); - grid_grad_y_t(i, k, l) += - ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + - (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * - output_grad_t(i, j, k, l); - } - } - } - } - const T x_max = static_cast(w - 1); - const T y_max = static_cast(h - 1); - grid_grad_x_t = grid_grad_x_t * (x_max / (T)2); - grid_grad_y_t = grid_grad_y_t * (y_max / (T)2); - - // gather grid_grad [x, y] in 3rd Dim - T* grid_grad_data = grid_grad->data(); - T* grid_grad_x_data = grid_grad_x.data(); - T* grid_grad_y_data = grid_grad_y.data(); - for (int i = 0; i < n * h * w; i++) { - grid_grad_data[2 * i] = grid_grad_x_data[i]; - grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; + Tensor grid_x, grid_y; + Tensor grid_x_scale, grid_y_scale; + calcGridLocationsWithGrad( + ctx.template device_context(), *grid, in_h, + in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale, + &grid_y_scale); + if (mode == "bilinear") { + gatherBilinearGrad(ctx.template device_context(), + *input, *output_grad, &grid_x, &grid_y, + &grid_x_scale, &grid_y_scale, input_grad, + grid_grad); + } else { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + gatherOutputGradToInputGrad(*output_grad, input_grad, grid_x, grid_y); } } }; diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu index 09c743c4275169ba8c53ccbd428100b2fc4483d6..4ce6856a7eade1b314d8aef1d039424ad42e07cf 100644 --- a/paddle/fluid/operators/huber_loss_op.cu +++ b/paddle/fluid/operators/huber_loss_op.cu @@ -16,7 +16,9 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( huber_loss, - ops::HuberLossKernel); + ops::HuberLossKernel, + ops::HuberLossKernel); REGISTER_OP_CUDA_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..12733a0d9f1689a020f77d23cc31b0d19b412746 --- /dev/null +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -0,0 +1,695 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/interpolate_v2_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { + auto dim_x = ctx->GetInputDim("X"); + auto interp_method = ctx->Attrs().Get("interp_method"); + + PADDLE_ENFORCE_EQ("linear", interp_method, + platform::errors::InvalidArgument( + "Interpolation method can only be \"linear\" when" + "Input(X) dimension is 3, but got method = %s .", + interp_method)); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + if (ctx->HasInputs("SizeTensor")) { + // top prority size + auto inputs_name = ctx->Inputs("SizeTensor"); + PADDLE_ENFORCE_EQ( + inputs_name.size(), 1, + platform::errors::InvalidArgument( + "Input(SizeTensor)'size of Op(interpolate) must be 1. " + "Attr(out_shape)'s length must be 1 for 3-D input tensor, but got " + "size = %d .", + inputs_name.size())); + int out_w = ctx->Attrs().Get("out_w"); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_w}; + } else { + dim_out = {dim_x[0], out_w, dim_x[2]}; + } + ctx->SetOutputDim("Out", dim_out); + + return; + } + + int out_w; + if (ctx->HasInput("Scale")) { + auto scale_tensor = ctx->GetInputDim("Scale"); + PADDLE_ENFORCE_EQ( + scale_tensor.size(), 1, + platform::errors::InvalidArgument( + "Scale's dimension size must be 1, but got dimension = %d .", + scale_tensor.size())); + PADDLE_ENFORCE_EQ( + scale_tensor[0], 1, + platform::errors::InvalidArgument( + "Scale's shape must be 1, but got shape = %d .", scale_tensor[0])); + // out_w = -1; + } else { + auto scale = ctx->Attrs().Get>("scale"); + if (scale.size() > 0) { + float scale_w = -1; + scale_w = scale[0]; + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + if (scale_w > 0.) { + // round down + out_w = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_w) + : static_cast(dim_x[1] * scale_w)); + // protect when input shape is -1 + out_w = out_w > 0 ? 
out_w : -1; + } + } else { + out_w = ctx->Attrs().Get("out_w"); + } + } + + if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { + auto out_size_dim = ctx->GetInputDim("OutSize"); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), 1, + platform::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got dimention = %d .", + out_size_dim.size())); + PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument( + "OutSize's dim[0] must be 1")); + ctx->ShareLoD("X", "Out"); + return; + } + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_w}; + } else { + dim_out = {dim_x[0], out_w, dim_x[2]}; + } + ctx->SetOutputDim("Out", dim_out); +} + +static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { + auto dim_x = ctx->GetInputDim("X"); + auto interp_method = ctx->Attrs().Get("interp_method"); + + PADDLE_ENFORCE( + "bilinear" == interp_method || "nearest" == interp_method || + "bicubic" == interp_method, + "Interpolation method can only be \"bilinear\" or \"nearest\" when " + "Input(X) dimension is 4, but got method = %s .", + interp_method); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + if (ctx->HasInputs("SizeTensor")) { + // top prority size + auto inputs_name = ctx->Inputs("SizeTensor"); + PADDLE_ENFORCE_EQ( + inputs_name.size(), 2, + platform::errors::InvalidArgument( + "Input(SizeTensor)'size of Op(interpolate) must be 2. " + "Attr(out_shape)'s length must be 2 for 4-D input " + "tensor, but got size = %d .", + inputs_name.size())); + int out_h = ctx->Attrs().Get("out_h"); + int out_w = ctx->Attrs().Get("out_w"); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h, out_w}; + } else { + dim_out = {dim_x[0], out_h, out_w, dim_x[3]}; + } + ctx->SetOutputDim("Out", dim_out); + + return; + } + + int out_h, out_w; + if (ctx->HasInput("Scale")) { + auto scale_tensor = ctx->GetInputDim("Scale"); + PADDLE_ENFORCE_EQ( + scale_tensor.size(), 1, + platform::errors::InvalidArgument( + "Scale's dimension size must be 1, but got dimension = %d .", + scale_tensor.size())); + PADDLE_ENFORCE_EQ(scale_tensor[0] == 2 || scale_tensor[0] == 1, true, + platform::errors::InvalidArgument( + "Scale's shape must be 2 or 1, but got shape = %d .", + scale_tensor[0])); + // out_h = -1; + // out_w = -1; + } else { + auto scale = ctx->Attrs().Get>("scale"); + if (scale.size() > 0) { + float scale_h = -1; + float scale_w = -1; + scale_h = scale[0]; + scale_w = scale[1]; + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + if (scale_h > 0. && scale_w > 0.) { + // round down + out_h = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_h) + : static_cast(dim_x[1] * scale_h)); + out_w = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale_w) + : static_cast(dim_x[2] * scale_w)); + // protect when input shape is -1 + out_h = out_h > 0 ? out_h : -1; + out_w = out_w > 0 ? 
out_w : -1; + } + } else { + out_h = ctx->Attrs().Get("out_h"); + out_w = ctx->Attrs().Get("out_w"); + } + } + + if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { + auto out_size_dim = ctx->GetInputDim("OutSize"); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), 1, + platform::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got dimension = %d .", + out_size_dim.size())); + PADDLE_ENFORCE_EQ( + out_size_dim[0], 2, + platform::errors::InvalidArgument( + "OutSize's dim[0] must be 2, but got dimention = %d .", + out_size_dim[0])); + ctx->ShareLoD("X", "Out"); + return; + } + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h, out_w}; + } else { + dim_out = {dim_x[0], out_h, out_w, dim_x[3]}; + } + ctx->SetOutputDim("Out", dim_out); +} + +static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { + auto dim_x = ctx->GetInputDim("X"); + auto interp_method = ctx->Attrs().Get("interp_method"); + + PADDLE_ENFORCE_EQ( + "trilinear", interp_method, + platform::errors::InvalidArgument( + "Interpolation method can only be \"trilinear\" when Input(X) " + "dimension is 5, but got method = %s .", + interp_method)); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + if (ctx->HasInputs("SizeTensor")) { + // top prority size + auto inputs_name = ctx->Inputs("SizeTensor"); + PADDLE_ENFORCE_EQ( + inputs_name.size(), 3, + platform::errors::InvalidArgument( + "Input(SizeTensor)'s size of Op(interpolate) must be 3. " + "Attr(out_shape)'s length must be 3 for 5-D input " + "tensor, but got size = %d .", + inputs_name.size())); + int out_d = ctx->Attrs().Get("out_d"); + int out_h = ctx->Attrs().Get("out_h"); + int out_w = ctx->Attrs().Get("out_w"); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w}; + } else { + dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]}; + } + ctx->SetOutputDim("Out", dim_out); + + return; + } + + int out_d, out_h, out_w; + if (ctx->HasInput("Scale")) { + auto scale_tensor = ctx->GetInputDim("Scale"); + PADDLE_ENFORCE_EQ( + scale_tensor.size(), 1, + platform::errors::InvalidArgument( + "Scale's dimension size must be 1, but got size = %d .", + scale_tensor.size())); + PADDLE_ENFORCE_EQ(scale_tensor[0] == 3 || scale_tensor[0] == 1, true, + platform::errors::InvalidArgument( + "Scale's shape must be 3 or 1, but got shape = %d .", + scale_tensor[0])); + // out_d = -1; + // out_h = -1; + // out_w = -1; + } else { + auto scale = ctx->Attrs().Get>("scale"); + if (scale.size() > 0) { + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { + // round down + out_d = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_d) + : static_cast(dim_x[1] * scale_d)); + out_h = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale_h) + : static_cast(dim_x[2] * scale_h)); + out_w = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[4] * scale_w) + : static_cast(dim_x[3] * scale_w)); + // protect when input shape is -1 + out_d = out_d > 0 ? out_d : -1; + out_h = out_h > 0 ? out_h : -1; + out_w = out_w > 0 ? 
out_w : -1; + } + } else { + out_d = ctx->Attrs().Get("out_d"); + out_h = ctx->Attrs().Get("out_h"); + out_w = ctx->Attrs().Get("out_w"); + } + } + + if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { + auto out_size_dim = ctx->GetInputDim("OutSize"); + PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, + "OutSize's dimension size must be 1, but got size =%d .", + out_size_dim.size()); + PADDLE_ENFORCE_EQ(out_size_dim[0], 3, + "OutSize's dim[0] must be 3, but got size = %d .", + out_size_dim[0]); + ctx->ShareLoD("X", "Out"); + return; + } + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w}; + } else { + dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]}; + } + ctx->SetOutputDim("Out", dim_out); +} + +class InterpolateV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of InterpolateV2Op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of InterpolationOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); // NCHW format + PADDLE_ENFORCE( + dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5, + platform::errors::Unimplemented( + "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .", + dim_x.size())); + + if (dim_x.size() == 3) { + // shape check for 1D interpolate for input tensor shape NCHW + Interpolate1DInferShapeCheck(ctx); + } else if (dim_x.size() == 4) { + // shape check for 2D interpolate for input tensor shape NCHW + Interpolate2DInferShapeCheck(ctx); + } else { // dim_x.size() == 5 + // shape check for 3D interpolate for input tensor shape NCDHW + Interpolate3DInferShapeCheck(ctx); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "SizeTensor" || var_name == "Scale") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of interpolate operator, " + "This is a 4-D tensor with shape of [N, C, H, W] or a " + "5-D tensor with shape of [N, C, D, H, W]."); + AddInput("OutSize", + "This is a 1-D tensor with two numbers to specify output size. " + "It should be [output_height, output_width] when input is a 4-D " + "tensor and should be [output_depth, output_height, output_width] " + "when input is a 5-D tensor. It has a higher priority than " + "the attr(out_d), attr(out_h), attr(out_w) and attr(scale).") + .AsDispensable(); + AddInput("SizeTensor", + "(vector>, optional). If provided, interpolate will " + "use this. The shape of the tensor in vector MUST BE [1]. " + "It has the highest priority compare with Input(OutSize) and " + "attr(out_d), attr(out_h), attr(out_w) and attr(scale).") + .AsDuplicable() + .AsDispensable(); + AddInput("Scale", + "This is a 1-D tensor with one number to specify output scale. 
" + "It has the higher priority compare with attr(scale).") + .AsDispensable(); + AddOutput("Out", + "The output tensor of interpolate operator, " + "This is a tensor in same rank with Input(X)."); + + AddAttr( + "data_layout", + "(string, default NCHW) Only used in " + "an optional string from: \"NHWC\", \"NCHW\". " + "Specify that the data format of the input and output data is " + "channel_first or channel_last.") + .SetDefault("NCHW"); + AddAttr("out_d", "output depth of interpolate op.").SetDefault(0); + AddAttr("out_h", "output height of interpolate op.").SetDefault(0); + AddAttr("out_w", "output width of interpolate op.").SetDefault(0); + AddAttr>("scale", "scale_d factor of interpolate op.") + .SetDefault(std::vector{}); + AddAttr("interp_method", + "(string, default \"bilinear\"), interpolation " + "method, can be \"linear\" for linear interpolation" + ",\"bilinear\" for " + "bilinear interpolation, \"trilinear\" for trilinear " + "interpolation and \"nearest\" for nearest " + "neighbor interpolation, and \"bicubic\" for bicubic" + "interpolation.") + .SetDefault("bilinear"); + AddAttr( + "align_corners", + "an optional bool. Defaults to True. " + "If True, the centers of 4 corner pixels of the input and output " + "tensors are aligned, preserving the values at the corner pixels, " + "If False, are not aligned") + .SetDefault(true); + AddAttr("align_mode", + "(int, default \'1\'), optional for bilinear interpolation, " + "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " + "can be \'1\' for src_idx = scale*dst_index .") + .SetDefault(1); + AddComment(R"DOC( + This operator samples input X to given output shape by using specified + interpolation method, the interpolation methods can be \"nearest\" + for nearest neighbor interpolation and \"bilinear\" for bilinear + interpolation and \"linear\" for linear interpolation.. + + Nearest neighbor interpolation is to perform nearest neighbor interpolation + in both the 3rd dimension(in height direction) and the 4th dimension(in width + direction) on input tensor. + + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then + again in the other direction. + + Trilinear interpolation is an extension of linear interpolation for + interpolating functions of three variables (e.g. D-direction, + H-direction and W-direction in this op) on a rectilinear 3D grid. + The linear interpolation is performed on three directions. + + Bicubic interpolation is an extension of cubic interpolation for interpolating + data points on a two-dimensional regular grid. The interpolated surface is + smoother than corresponding surfaces obtained by bilinear interpolation or + nearest-neighbor interpolation. + + Align_corners and align_mode are optional parameters,the calculation method + of interpolation can be selected by them. 
+ + Example: + + For scale: + + if align_corners = True and out_{size}>1 : + + scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0) + + else: + + scale_{factor} = float(in_{size}/out_{size}) + + + Nearest neighbor interpolation: + + if: + align_corners = False + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor + W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor + + else: + align_corners = True + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + Trilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + + D_out = D_{in} * scale_{factor} + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + Bicubic interpolation: + + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + For details of nearest neighbor interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation + + For details of bilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bilinear_interp_v2olation + + For details of trilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Trilinear_interp_v2olation + + For details of bicubic interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bicubic_interpolation + )DOC"); + } +}; + +class InterpolateV2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "SizeTensor" || var_name == "Scale") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + 
} +}; + +template +class InterpolateV2GradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + if (this->HasInput("SizeTensor") > 0) { + op->SetInput("SizeTensor", this->Input("SizeTensor")); + } + if (this->HasInput("OutSize") > 0) { + op->SetInput("OutSize", this->Input("OutSize")); + } + if (this->HasInput("Scale") > 0) { + op->SetInput("Scale", this->Input("Scale")); + } + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer, + "X"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op, + ops::InterpolateV2OpMaker, + ops::InterpolateV2GradMaker, + ops::InterpolateV2GradMaker); +REGISTER_OPERATOR(bilinear_interp_v2_grad, ops::InterpolateV2OpGrad, + ops::InterpolateV2GradNoNeedBufferVarsInferer); +REGISTER_OPERATOR(nearest_interp_v2, ops::InterpolateV2Op, + ops::InterpolateV2OpMaker, + ops::InterpolateV2GradMaker, + ops::InterpolateV2GradMaker); +REGISTER_OPERATOR(nearest_interp_v2_grad, ops::InterpolateV2OpGrad, + ops::InterpolateV2GradNoNeedBufferVarsInferer); +REGISTER_OPERATOR(trilinear_interp_v2, ops::InterpolateV2Op, + ops::InterpolateV2OpMaker, + ops::InterpolateV2GradMaker, + ops::InterpolateV2GradMaker); +REGISTER_OPERATOR(trilinear_interp_v2_grad, ops::InterpolateV2OpGrad, + ops::InterpolateV2GradNoNeedBufferVarsInferer); +REGISTER_OPERATOR(bicubic_interp_v2, ops::InterpolateV2Op, + ops::InterpolateV2OpMaker, + ops::InterpolateV2GradMaker, + ops::InterpolateV2GradMaker); +REGISTER_OPERATOR(bicubic_interp_v2_grad, ops::InterpolateV2OpGrad, + ops::InterpolateV2GradNoNeedBufferVarsInferer); +REGISTER_OP_CPU_KERNEL(bilinear_interp_v2, ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel); +REGISTER_OP_CPU_KERNEL(bilinear_interp_v2_grad, + ops::InterpolateV2GradKernel, + ops::InterpolateV2GradKernel); +REGISTER_OP_CPU_KERNEL(nearest_interp_v2, ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel); +REGISTER_OP_CPU_KERNEL(nearest_interp_v2_grad, + ops::InterpolateV2GradKernel, + ops::InterpolateV2GradKernel); +REGISTER_OP_CPU_KERNEL(trilinear_interp_v2, ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel); +REGISTER_OP_CPU_KERNEL(trilinear_interp_v2_grad, + ops::InterpolateV2GradKernel, + ops::InterpolateV2GradKernel); +REGISTER_OPERATOR(linear_interp_v2, ops::InterpolateV2Op, + ops::InterpolateV2OpMaker, + ops::InterpolateV2GradMaker, + ops::InterpolateV2GradMaker); +REGISTER_OPERATOR(linear_interp_v2_grad, ops::InterpolateV2OpGrad, + ops::InterpolateV2GradNoNeedBufferVarsInferer); +REGISTER_OP_CPU_KERNEL(linear_interp_v2, ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel); +REGISTER_OP_CPU_KERNEL(linear_interp_v2_grad, + ops::InterpolateV2GradKernel, + ops::InterpolateV2GradKernel); +REGISTER_OP_CPU_KERNEL(bicubic_interp_v2, ops::InterpolateV2Kernel, + ops::InterpolateV2Kernel); +REGISTER_OP_CPU_KERNEL(bicubic_interp_v2_grad, + ops::InterpolateV2GradKernel, + ops::InterpolateV2GradKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op.cu 
b/paddle/fluid/operators/interpolate_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6cb8104638dea458743374014e7bef35df2dbfcc --- /dev/null +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -0,0 +1,1578 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include "paddle/fluid/operators/interpolate_v2_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +template +__global__ void KeNearestNeighborInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + if (data_layout == DataLayout::kNCHW) { + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + } +} + +template +__global__ void KeNearestNeighborInterpBw( + T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, + const size_t input_w, const T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + const T out_pos = out[out_id_h * output_w + out_id_w]; + platform::CudaAtomicAdd(in_pos, out_pos); + } +} + +template +__global__ void KeLinearInterpFw(const T* in, const size_t in_img_w, + const size_t input_w, T* out, + const size_t out_img_w, const size_t output_h, + const size_t output_w, + const size_t num_channels, const float ratio_w, + const bool align_corners, const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idx = tid % out_img_w; + } else { + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id + + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? 
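The nearest-neighbour kernels above reduce to a single index computation per output pixel: with align_corners the scaled coordinate is rounded to the nearest pixel, otherwise it is truncated. A host-side illustration (editorial sketch, not part of the patch; the function name is invented), assuming ratio is the in/out ratio computed by the callers:

#include <cstdio>

// Mirrors the index selection in KeNearestNeighborInterpFw/Bw.
static int NearestSourceIndex(int out_idx, float ratio, bool align_corners) {
  return align_corners ? static_cast<int>(ratio * out_idx + 0.5f)
                       : static_cast<int>(ratio * out_idx);
}

int main() {
  const float ratio = 4.0f / 8.0f;  // in_w = 4, out_w = 8, align_corners = false
  for (int x = 0; x < 8; ++x) {
    std::printf("out %d -> in %d\n", x, NearestSourceIndex(x, ratio, false));
  }
  return 0;
}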
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + const T* in_pos = + &in[out_id_h * out_id_w + channel_id * in_img_size + in_img_idx]; + // linear interpolation + out[out_id_h * output_w + out_id_w] = + w2lambda * in_pos[0] + w1lambda * in_pos[w_id]; + + } else { + const T* in_pos = + &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; + // linear interpolation + out[out_id_h * output_w + out_id_w] = + w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]; + } + } +} + +template +__global__ void KeLinearInterpBw(T* in, const size_t in_img_w, + const size_t input_w, const T* out, + const size_t out_img_w, const size_t output_h, + const size_t output_w, + const size_t num_channels, const T ratio_w, + const bool align_corners, const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idx = tid % out_img_w; + } else { + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 + : ratio_w * out_img_idx; + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id + + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; + } + const T* out_pos = &out[out_id_w]; + + if (data_layout == DataLayout::kNCHW) { + platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + } else { + platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + w1lambda * out_pos[0]); + } + } +} + +template +__global__ void KeBilinearInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id]); + } else { + const T* in_pos = + &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * + (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] + + w1lambda * in_pos[h_id * in_img_w * num_channels + + w_id * num_channels]); + } + } +} + +template +__global__ void KeBilinearInterpBw( + T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, + const size_t input_w, const T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const T ratio_h, const T ratio_w, + const bool align_corners, const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 + : ratio_h * out_img_idy; + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 + : ratio_w * out_img_idx; + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + if (data_layout == DataLayout::kNCHW) { + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], + h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); + } else { + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], + h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd( + &in_pos[h_id * in_img_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * out_pos[0]); + } + } +} + +template +__global__ void KeTrilinearInterpFw( + const T* in, const size_t in_img_d, const size_t in_img_h, + const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, + const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, + const size_t output_h, const size_t output_w, const size_t num_channels, + const float ratio_d, const float ratio_h, const float ratio_w, + const bool align_corners, const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; + + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? 
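The bilinear forward kernel blends the four surrounding input pixels with the h/w lambda weights computed above, and the backward kernel scatters the incoming gradient with the same weights through CudaAtomicAdd, since several output pixels may map onto the same input pixel. A host-side sketch of the weight computation and blend for one output pixel (editorial illustration only; the function name and the single-channel NCHW layout are assumptions):

#include <algorithm>
#include <cstdio>
#include <vector>

// Host-side restatement of one bilinear output pixel, following the weight
// logic of KeBilinearInterpFw: align_flag = (align_mode == 0 && !align_corners)
// shifts the sampling position by half a pixel before scaling.
static float BilinearSample(const std::vector<float>& in, int in_h, int in_w,
                            int oy, int ox, float ratio_h, float ratio_w,
                            bool align_corners, int align_mode) {
  const bool align_flag = (align_mode == 0 && !align_corners);

  float src_h = align_flag ? ratio_h * (oy + 0.5f) - 0.5f : ratio_h * oy;
  float src_w = align_flag ? ratio_w * (ox + 0.5f) - 0.5f : ratio_w * ox;
  src_h = std::max(src_h, 0.0f);
  src_w = std::max(src_w, 0.0f);

  const int y = static_cast<int>(src_h);
  const int x = static_cast<int>(src_w);
  const int y_id = (y < in_h - 1) ? 1 : 0;  // step to the next row if it exists
  const int x_id = (x < in_w - 1) ? 1 : 0;  // step to the next column if it exists

  const float h1 = src_h - y, h2 = 1.0f - h1;  // h1lambda / h2lambda
  const float w1 = src_w - x, w2 = 1.0f - w1;  // w1lambda / w2lambda

  const float* p = &in[y * in_w + x];
  return h2 * (w2 * p[0] + w1 * p[x_id]) +
         h1 * (w2 * p[y_id * in_w] + w1 * p[y_id * in_w + x_id]);
}

int main() {
  std::vector<float> img = {0, 1, 2, 3};  // 2x2 image
  const float ratio = 2.0f / 4.0f;        // upsample to 4x4, align_corners=false
  std::printf("%f\n", BilinearSample(img, 2, 2, 1, 1, ratio, ratio, false, 0));
  return 0;
}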
src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w] + + w1lambda * in_pos1[h_id * in_img_w + w_id])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w] + + w1lambda * in_pos2[h_id * in_img_w + w_id])); + + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + + w1lambda * in_pos1[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] + + w1lambda * in_pos1[h_id * in_img_w * num_channels + + w_id * num_channels])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + + w1lambda * in_pos2[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] + + w1lambda * in_pos2[h_id * in_img_w * num_channels + + w_id * num_channels])); + } + } +} + +template +__global__ void KeTrilinearInterpBw( + T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, const T* out, + const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, + const size_t output_h, const size_t output_w, const size_t num_channels, + const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners, + const int align_mode, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % 
(out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; + + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + platform::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[w_id], + d2lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[w_id], + d1lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + platform::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], + d2lambda * h1lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[w_id * 
num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], + d1lambda * h1lambda * w1lambda * out_pos[0]); + } + } +} + +template +__device__ __forceinline__ static T Kecubic_interp(const T x0, const T x1, + const T x2, const T x3, + T t) { + T coeffs[4]; + T a = -0.75; + T x_1 = t; + T x_2 = 1.0 - t; + coeffs[0] = cubic_convolution2(x_1 + 1.0, a); + coeffs[1] = cubic_convolution1(x_1, a); + coeffs[2] = cubic_convolution1(x_2, a); + coeffs[3] = cubic_convolution2(x_2 + 1.0, a); + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +template +__global__ void KeBicubicInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + T in_img_idy = align_corners + ? static_cast(ratio_h * out_img_idy) + : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); + int input_y = floorf(in_img_idy); + const T y_t = in_img_idy - input_y; + + T in_img_idx = align_corners + ? 
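Kecubic_interp above combines four samples with Keys' cubic convolution kernel (a = -0.75); the cubic_convolution1/2 helpers it calls appear to come from the accompanying header, which is not shown in full here. A self-contained restatement of those weights, with invented helper names and under that assumption:

#include <cstdio>

// Keys cubic convolution kernel with a = -0.75, split the same way as the
// helpers used by Kecubic_interp: the first form covers |x| <= 1, the second
// covers 1 < |x| < 2.
static float CubicNear(float x, float a) {
  return ((a + 2.0f) * x - (a + 3.0f)) * x * x + 1.0f;
}
static float CubicFar(float x, float a) {
  return ((a * x - 5.0f * a) * x + 8.0f * a) * x - 4.0f * a;
}

// Interpolates between x1 and x2 at fractional offset t, with x0 and x3 as the
// outer support samples, mirroring the device helper above.
static float CubicInterp(float x0, float x1, float x2, float x3, float t) {
  const float a = -0.75f;
  const float c0 = CubicFar(t + 1.0f, a);
  const float c1 = CubicNear(t, a);
  const float c2 = CubicNear(1.0f - t, a);
  const float c3 = CubicFar(2.0f - t, a);
  return x0 * c0 + x1 * c1 + x2 * c2 + x3 * c3;
}

int main() {
  // Halfway between 1 and 2 on the ramp 0,1,2,3 stays at 1.5.
  std::printf("%f\n", CubicInterp(0.0f, 1.0f, 2.0f, 3.0f, 0.5f));
  return 0;
}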
static_cast(ratio_w * out_img_idx) + : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); + int input_x = floorf(in_img_idx); + const T x_t = in_img_idx - input_x; + + T coefficients[4]; + const T* in_pos_0; + const T* in_pos_1; + const T* in_pos_2; + const T* in_pos_3; + int access_x_0; + if (data_layout == DataLayout::kNCHW) { + for (int k = 0; k < 4; k++) { + int access_y = + max(min(input_y - 1 + k, static_cast(in_img_h - 1)), 0); + access_x_0 = max(min(input_x - 1, static_cast(in_img_w - 1)), 0); + int access_x_1 = + max(min(input_x + 0, static_cast(in_img_w - 1)), 0); + int access_x_2 = + max(min(input_x + 1, static_cast(in_img_w - 1)), 0); + int access_x_3 = + max(min(input_x + 2, static_cast(in_img_w - 1)), 0); + + in_pos_0 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_0]; + in_pos_1 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_1]; + in_pos_2 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_2]; + in_pos_3 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_3]; + + coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], + in_pos_2[0], in_pos_3[0], x_t); + } + + out[out_id_h * output_w + out_id_w] = + Kecubic_interp(coefficients[0], coefficients[1], coefficients[2], + coefficients[3], y_t); + + } else { + for (int k = 0; k < 4; k++) { + int access_y = + max(min(input_y - 1 + k, static_cast((in_img_h - 1))), 0); + int access_x_0 = + max(min(input_x - 1, static_cast((in_img_w - 1))), 0); + int access_x_1 = + max(min(input_x + 0, static_cast((in_img_w - 1))), 0); + int access_x_2 = + max(min(input_x + 1, static_cast((in_img_w - 1))), 0); + int access_x_3 = + max(min(input_x + 2, static_cast((in_img_w - 1))), 0); + + const T* in_pos_0 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_0 * num_channels + channel_id]; + const T* in_pos_1 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_1 * num_channels + channel_id]; + const T* in_pos_2 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_2 * num_channels + channel_id]; + const T* in_pos_3 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_3 * num_channels + channel_id]; + + coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], in_pos_2[0], + in_pos_3[0], x_t); + } + + out[out_id_h * output_w + out_id_w] = + static_cast(Kecubic_interp(coefficients[0], coefficients[1], + coefficients[2], coefficients[3], y_t)); + } + } +} + +template +__global__ void KeBicubicInterpBw( + T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, + const size_t input_w, const T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / 
(out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + T in_img_idy = align_corners + ? static_cast(ratio_h * out_img_idy) + : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); + int input_y = floorf(in_img_idy); + const T y_t = in_img_idy - input_y; + + T in_img_idx = align_corners + ? static_cast(ratio_w * out_img_idx) + : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); + int input_x = floorf(in_img_idx); + + const T x_t = in_img_idx - input_x; + + T x_coeffs[4]; + T y_coeffs[4]; + + get_cubic_upsample_coefficients(x_coeffs, x_t); + get_cubic_upsample_coefficients(y_coeffs, y_t); + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + T* in_pos; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + int access_y = max(min(static_cast(input_y - 1 + j), + static_cast(in_img_h - 1)), + 0); + int access_x = max(min(static_cast(input_x - 1 + i), + static_cast(in_img_w - 1)), + 0); + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x]; + } else { + in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x * num_channels + channel_id]; + } + platform::CudaAtomicAdd(&in_pos[0], + (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + } + } + } +} + +template +static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + auto* input_data = input.data(); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_w = ctx.Attr("out_w"); + + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_w = new_size[0]; + } else { + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_w > 0.) { + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_w = size_data[0]; + } + } + PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_w}; + } else { + dim_out = {n, out_w, c}; + } + auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1.0) / (out_w - 1.0) + : static_cast(in_w) / out_w; + } + + int in_cw = c * in_w; + int out_cw = c * out_w; + int pixelNum = n * out_cw; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); + + if ("linear" == interp_method) { + KeLinearInterpFw<<>>( + input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w, + align_corners, align_mode, data_layout); + } +} + +template +static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + auto* input_data = input.data(); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_w = scale[1]; + scale_h = scale[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_w > 0. && scale_h > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + } + PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
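The forward CUDA entry points above resolve the output extent from several optional inputs before launching a kernel: a SizeTensor list wins outright, otherwise a positive scale rescales the input extent and an OutSize tensor, if given, overrides the result, with the out_* attributes as the fallback. A compact host-side sketch of that precedence for the 1-D case (editorial illustration; ResolveOutW and its parameters are invented names):

#include <vector>

// Mirrors the order of the checks in the 1-D forward path.
static int ResolveOutW(int in_w, int attr_out_w, float scale_w,
                       int out_size_or_neg,
                       const std::vector<int>& size_tensor) {
  if (!size_tensor.empty()) {
    return size_tensor[0];  // SizeTensor takes priority over everything else
  }
  int out_w = attr_out_w;
  if (scale_w > 0.0f) {
    out_w = static_cast<int>(in_w * scale_w);  // scale rescales the input width
  }
  if (out_size_or_neg > 0) {
    out_w = out_size_or_neg;  // an explicit OutSize overrides the scaled value
  }
  return out_w;
}

int main() {
  // A scale of 2.0 doubles a width of 16, but the explicit OutSize of 40 wins.
  return ResolveOutW(16, -1, 2.0f, 40, {}) == 40 ? 0 : 1;
}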
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = c * in_hw; + int out_chw = c * out_hw; + + int pixelNum = n * out_chw; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); + + if ("nearest" == interp_method) { + KeNearestNeighborInterpFw<<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + } else if ("bilinear" == interp_method) { + KeBilinearInterpFw<<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); + } else if ("bicubic" == interp_method) { + KeBicubicInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + } +} + +template +static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + auto* input_data = input.data(); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + } + PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( + "out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } + auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + int in_dhw = in_d * in_h * in_w; + int out_dhw = out_d * out_h * out_w; + int in_cdhw = c * in_dhw; + int out_cdhw = c * out_dhw; + + int pixelNum = n * out_cdhw; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); + + if ("trilinear" == interp_method) { + KeTrilinearInterpFw<<>>( + input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, + out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, + align_mode, data_layout); + } +} + +template +static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_w = ctx.Attr("out_w"); + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_w > 0.) 
{ + out_w = static_cast(in_w * scale_w); + } + + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_w = size_data[0]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_w = new_size[0]; + } + + auto* output_grad_data = output_grad.data(); + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_w}; + } else { + dim_grad = {n, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + int in_cw = c * in_w; + int out_cw = c * out_w; + int pixelNum = n * out_cw; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); + + if ("linear" == interp_method) { + KeLinearInterpBw<<>>( + input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c, + ratio_w, align_corners, align_mode, data_layout); + } +} + +template +static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_w = scale[1]; + scale_h = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_w > 0. && scale_h > 0.) 
{ + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } + + auto* output_grad_data = output_grad.data(); + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = c * in_hw; + int out_chw = c * out_hw; + + int pixelNum = n * out_chw; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); + + if ("nearest" == interp_method) { + KeNearestNeighborInterpBw<<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, + n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + } else if ("bilinear" == interp_method) { + KeBilinearInterpBw<<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, + n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, + data_layout); + } else if ("bicubic" == interp_method) { + KeBicubicInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, + n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + } +} + +template +static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, + const Tensor& output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + 
"should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + auto* output_grad_data = output_grad.data(); + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + int in_dhw = in_d * in_h * in_w; + int out_dhw = out_d * out_h * out_w; + int in_cdhw = c * in_dhw; + int out_cdhw = c * out_dhw; + + int pixelNum = n * out_cdhw; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); + + if ("trilinear" == interp_method) { + KeTrilinearInterpBw<<>>( + input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, + out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, + align_mode, data_layout); + } +} + +template +class InterpolateOpV2CUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::NotFound("This kernel only runs on GPU device.")); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + auto input_dims = input->dims(); + if (input_dims.size() == 3) { // 1D interpolation + Interpolate1DCUDAFwd(ctx, *input, output); + } else if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCUDAFwd(ctx, *input, output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCUDAFwd(ctx, *input, output); + } + } +}; + +template +class InterpolateV2GradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::NotFound("This kernel only runs on GPU device.")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + + auto output_grad_dims = output_grad->dims(); + if (output_grad_dims.size() == 3) { // 1D interpolation + Interpolate1DCUDABwd(ctx, input_grad, *output_grad); + } else if (output_grad_dims.size() == 4) { // 2D interpolation + Interpolate2DCUDABwd(ctx, input_grad, *output_grad); + } else if (output_grad_dims.size() == 5) { // 3D interpolation + Interpolate3DCUDABwd(ctx, input_grad, *output_grad); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel); +REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2_grad, + ops::InterpolateV2GradOpCUDAKernel, + ops::InterpolateV2GradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(nearest_interp_v2, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel); +REGISTER_OP_CUDA_KERNEL(nearest_interp_v2_grad, + ops::InterpolateV2GradOpCUDAKernel, + ops::InterpolateV2GradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel); +REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2_grad, + ops::InterpolateV2GradOpCUDAKernel, + ops::InterpolateV2GradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(linear_interp_v2, ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel); +REGISTER_OP_CUDA_KERNEL(linear_interp_v2_grad, + ops::InterpolateV2GradOpCUDAKernel, + ops::InterpolateV2GradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel, + ops::InterpolateOpV2CUDAKernel); +REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2_grad, + 
ops::InterpolateV2GradOpCUDAKernel, + ops::InterpolateV2GradOpCUDAKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..111766934b8300c0a7b46ae9a065b8c42460e577 --- /dev/null +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -0,0 +1,1386 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +template +using EigenTensor = framework::EigenTensor; +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + PADDLE_ENFORCE_EQ( + tensor->dims(), framework::make_ddim({1}), + platform::errors::InvalidArgument("shape of dim tensor should be [1]")); + if (platform::is_gpu_place(tensor->place())) { + framework::Tensor temp; + TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_new_shape.push_back(static_cast(*temp.data())); + } else { + vec_new_shape.push_back(static_cast(*tensor->data())); + } + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + framework::Tensor cpu_starts_tensor; + if (platform::is_gpu_place(new_data_tensor->place())) { + TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } + vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); + return vec_new_data; +} + +inline void ExtractNCDWH(const framework::DDim& dims, + const DataLayout& data_layout, int* N, int* C, int* D, + int* H, int* W) { + *N = dims[0]; + + if (dims.size() == 3) { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[2]; + *D = 1; + *H = 1; + *W = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + } else if (dims.size() == 4) { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3]; + *D = 1; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4]; + *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + *W = data_layout == DataLayout::kNCHW ? 
dims[4] : dims[3]; + } +} + +template +static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, + const float ratio_h, const float ratio_w, + const int n, const int c, + const int out_h, const int out_w, + const bool align_corners, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, k, l) = input_t(i, j, in_k, in_l); + } else { + output_t(i, k, l, j) = input_t(i, in_k, in_l, j); + } + } + } + } + } +} + +template +static void LinearInterpolation(const Tensor& input, Tensor* output, + const float ratio_w, const int in_w, + const int n, const int c, const int out_w, + const bool align_corners, const bool align_mode, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; // w + int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id + + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; // w1lambda + float d_e = 1.f - d_w; // w2lambda + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + for (int l = 0; l < out_w; l++) { + // linear interpolation + T out_t; + if (data_layout == DataLayout::kNCHW) { + out_t = input_t(i, j, vx_w[l]) * vd_e[l] + + input_t(i, j, vx_e[l]) * vd_w[l]; + output_t(i, j, l) = out_t; + } else { + out_t = input_t(i, vx_w[l], j) * vd_e[l] + + input_t(i, vx_e[l], j) * vd_w[l]; + output_t(i, l, j) = out_t; + } + } + } + } +} + +template +static void LinearInterpolationGrad(const Tensor& output_grad, + Tensor* input_grad, const float ratio_w, + const int in_w, const int n, const int c, + const int out_w, const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; // w + int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id + + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; // w1lambda + float d_e = 1.f - d_w; // w2lambda + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // linear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(i, j, l); + input_grad_t(i, j, x_w) += static_cast(grad * d_e); + input_grad_t(i, j, x_e) += static_cast(grad * d_w); + } else { + const T grad = output_grad_t(i, l, j); + input_grad_t(i, x_w, j) += static_cast(grad * d_e); + input_grad_t(i, x_e, j) += static_cast(grad * d_w); + } + } + } + } +} + +template +static void BilinearInterpolation(const Tensor& input, Tensor* output, + const float ratio_h, const float ratio_w, + const int in_h, const int in_w, const int n, + const int c, const int out_h, const int out_w, + const bool align_corners, + const bool align_mode, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int k = 0; k < out_h; k++) { + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + { + vy_n[k] = y_n; + vy_s[k] = y_s; + vd_n[k] = d_n; + vd_s[k] = d_s; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(4) +#endif + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + for (int k = 0; k < out_h; k++) { // loop for images + for (int l = 0; l < out_w; l++) { + // bilinear interpolation + T out_t; + if (data_layout == DataLayout::kNCHW) { + out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + + input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] + + input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] + + input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l]; + output_t(i, j, k, l) = out_t; + + } else { + out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] + + input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] + + input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] + + input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l]; + output_t(i, k, l, j) = out_t; + } + } + } + } + } +} + +template +static void TrilinearInterpolation( + const Tensor& input, Tensor* output, const float ratio_d, + const float ratio_h, const float ratio_w, const int in_d, const int in_h, + const int in_w, const int n, const int c, const int out_d, const int out_h, + const int out_w, const bool align_corners, const bool align_mode, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vt_f, vt_b; + std::vector vd_f, vd_b; + vt_f.reserve(out_d); + vt_b.reserve(out_d); + vd_f.reserve(out_d); + vd_b.reserve(out_d); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int j = 0; j < out_d; j++) { + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + { + vt_f[j] = t_f; + vt_b[j] = t_b; + vd_f[j] = d_f; + vd_b[j] = d_b; + } + } + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int k = 0; k < out_h; k++) { + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + { + vy_n[k] = y_n; + vy_s[k] = y_s; + vd_n[k] = d_n; + vd_s[k] = d_s; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(5) +#endif + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + for (int j = 0; j < out_d; j++) { // loop for D, H, W + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + // trilinear interpolation + if (data_layout == DataLayout::kNCHW) { + T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, i, j, k, l) = out_t; + } else { + T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, j, k, l, i) = out_t; + } + } + } + } + } + } +} + +template +HOSTDEVICE inline T cubic_convolution1(T x, T A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +HOSTDEVICE inline T cubic_convolution2(T x, T A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) { + T A = -0.75; + + T x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + T x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) { + T coeffs[4]; + get_cubic_upsample_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +template +static void BicubicInterpolation(const Tensor& input, Tensor* output, + const float ratio_h, const float ratio_w, + const int in_h, const int in_w, const int n, + const int c, const int out_h, const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + + for (int k = 0; k < out_h; k++) { // loop for images + T y_n = align_corners ? static_cast(ratio_h * k) + : static_cast(ratio_h * (k + 0.5) - 0.5); + int input_y = floorf(y_n); + const T y_t = y_n - input_y; + + for (int l = 0; l < out_w; l++) { + T x_n = align_corners ? 
static_cast(ratio_w * l) + : static_cast(ratio_w * (l + 0.5) - 0.5); + int input_x = floorf(x_n); + const T x_t = x_n - input_x; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + T coefficients[4]; + // interp 4 times in x direction + for (int ii = 0; ii < 4; ii++) { + int access_y = std::max(std::min(input_y - 1 + ii, in_h - 1), + static_cast(0)); + int access_x_0 = + std::max(std::min(input_x - 1, in_w - 1), static_cast(0)); + int access_x_1 = + std::max(std::min(input_x + 0, in_w - 1), static_cast(0)); + int access_x_2 = + std::max(std::min(input_x + 1, in_w - 1), static_cast(0)); + int access_x_3 = + std::max(std::min(input_x + 2, in_w - 1), static_cast(0)); + if (data_layout == DataLayout::kNCHW) { + coefficients[ii] = + cubic_interp(input_t(i, j, access_y, access_x_0), + input_t(i, j, access_y, access_x_1), + input_t(i, j, access_y, access_x_2), + input_t(i, j, access_y, access_x_3), x_t); + } else { + coefficients[ii] = + cubic_interp(input_t(i, access_y, access_x_0, j), + input_t(i, access_y, access_x_1, j), + input_t(i, access_y, access_x_2, j), + input_t(i, access_y, access_x_3, j), x_t); + } + } + + // interp y direction + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, k, l) = + cubic_interp(coefficients[0], coefficients[1], + coefficients[2], coefficients[3], y_t); + } else { + output_t(i, k, l, j) = + cubic_interp(coefficients[0], coefficients[1], + coefficients[2], coefficients[3], y_t); + } + } + } + } + } +} + +template +static void NearestNeighborInterpolateGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_h, + const float ratio_w, const int n, const int c, const int out_h, + const int out_w, const bool align_corners, const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); + } else { + input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j); + } + } + } + } + } +} + +template +static void BilinearInterpolationGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_h, + const float ratio_w, const int in_h, const int in_w, const int n, + const int c, const int out_h, const int out_w, const bool align_corners, + const int align_mode, const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int k = 0; k < out_h; k++) { // loop for images + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? 
x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bilinear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); + input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); + input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); + input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); + } else { + const T grad = output_grad_t(i, k, l, j); + input_grad_t(i, y_n, x_w, j) += static_cast(grad * d_s * d_e); + input_grad_t(i, y_s, x_w, j) += static_cast(grad * d_n * d_e); + input_grad_t(i, y_n, x_e, j) += static_cast(grad * d_s * d_w); + input_grad_t(i, y_s, x_e, j) += static_cast(grad * d_n * d_w); + } + } + } + } + } +} + +template +static void TrilinearInterpolationGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_d, + const float ratio_h, const float ratio_w, const int in_d, const int in_h, + const int in_w, const int n, const int c, const int out_d, const int out_h, + const int out_w, const bool align_corners, const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int j = 0; j < out_d; j++) { // loop for D + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + + for (int k = 0; k < out_h; k++) { // loop for H + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { // loop for W + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + // trilinear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(b, i, j, k, l); + input_grad_t(b, i, t_f, y_n, x_w) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, i, t_f, y_n, x_e) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, i, t_f, y_s, x_w) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, i, t_f, y_s, x_e) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, i, t_b, y_n, x_w) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, i, t_b, y_n, x_e) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, i, t_b, y_s, x_w) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, i, t_b, y_s, x_e) += + static_cast(grad * d_f * d_n * d_w); + } else { + const T grad = output_grad_t(b, j, k, l, i); + input_grad_t(b, t_f, y_n, x_w, i) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, t_f, y_n, x_e, i) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, t_f, y_s, x_w, i) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, t_f, y_s, x_e, i) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, t_b, y_n, x_w, i) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, t_b, y_n, x_e, i) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, t_b, y_s, x_w, i) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, t_b, y_s, x_e, i) += + static_cast(grad * d_f * d_n * d_w); + } + } + } + } + } + } +} + +template +static void BicubicInterpolationGrad(const Tensor& output_grad, + Tensor* input_grad, const float ratio_h, + const float ratio_w, const int in_h, + const int in_w, const int n, const int c, + const int out_h, const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int k = 0; k < out_h; k++) { // loop for images + T y_n = align_corners ? static_cast(ratio_h * k) + : static_cast(ratio_h * (k + 0.5) - 0.5); + int input_y = floorf(y_n); + T y_t = y_n - input_y; + + for (int l = 0; l < out_w; l++) { + T x_n = align_corners ? 
static_cast(ratio_w * l) + : static_cast(ratio_w * (l + 0.5) - 0.5); + int input_x = floorf(x_n); + T x_t = x_n - input_x; + + T x_coeffs[4]; + T y_coeffs[4]; + + get_cubic_upsample_coefficients(x_coeffs, x_t); + get_cubic_upsample_coefficients(y_coeffs, y_t); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bicubic interpolation grad + for (int ii = 0; ii < 4; ii++) { + for (int jj = 0; jj < 4; jj++) { + int access_x = std::max(std::min(input_x - 1 + ii, in_w - 1), + static_cast(0)); + int access_y = std::max(std::min(input_y - 1 + jj, in_h - 1), + static_cast(0)); + if (data_layout == DataLayout::kNCHW) { + T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, access_y, access_x) += + grad * y_coeffs[jj] * x_coeffs[ii]; + } else { + T grad = output_grad_t(i, k, l, j); + input_grad_t(i, access_y, access_x, j) += + grad * y_coeffs[jj] * x_coeffs[ii]; + } + } + } + } + } + } + } +} + +template +static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_w = ctx.Attr("out_w"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_w = new_size[0]; + } else { + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_w > 0.) { + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_w = out_size_data[0]; + } + } + PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_w}; + } else { + dim_out = {n, out_w, c}; + } + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + if ("linear" == interp_method) { + LinearInterpolation(input, output, ratio_w, in_w, n, c, out_w, + align_corners, align_mode, data_layout); + } +} + +template +static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } + PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + if ("bilinear" == interp_method) { + BilinearInterpolation(input, output, ratio_h, ratio_w, in_h, in_w, n, c, + out_h, out_w, align_corners, align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolate(input, output, ratio_h, ratio_w, n, c, out_h, + out_w, align_corners, data_layout); + } else if ("bicubic" == interp_method) { + BicubicInterpolation(input, output, ratio_h, ratio_w, in_h, in_w, n, c, + out_h, out_w, align_corners, data_layout); + } +} + +template +static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_w > 0. && scale_h > 0. && scale_d > 0.) 
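As in the 1D and 2D forward paths above, the 3D path resolves the target shape with a fixed precedence: an explicit SizeTensor list wins outright, otherwise the Scale input or scale attribute rescales the input extents, and an OutSize tensor overrides the scale-derived result. A minimal host-side sketch of that precedence, with an illustrative helper name (ResolveOutSize3D) that is not part of the operator:

// Sketch of the size-resolution order used by the forward paths in this file:
// SizeTensor > OutSize > Scale/scale attribute > out_d/out_h/out_w attributes.
#include <vector>

struct ResolvedSize { int d, h, w; };

inline ResolvedSize ResolveOutSize3D(int in_d, int in_h, int in_w,
                                     int attr_d, int attr_h, int attr_w,
                                     const std::vector<float>& scale,      // empty when unset
                                     const std::vector<int>& out_size,     // empty when unset
                                     const std::vector<int>& size_tensor)  // empty when unset
{
  if (size_tensor.size() == 3) {  // SizeTensor has the highest priority
    return {size_tensor[0], size_tensor[1], size_tensor[2]};
  }
  ResolvedSize r{attr_d, attr_h, attr_w};
  if (scale.size() == 3 && scale[0] > 0 && scale[1] > 0 && scale[2] > 0) {
    r = {static_cast<int>(in_d * scale[0]),
         static_cast<int>(in_h * scale[1]),
         static_cast<int>(in_w * scale[2])};
  }
  if (out_size.size() == 3) {     // OutSize overrides a scale-derived shape
    r = {out_size[0], out_size[1], out_size[2]};
  }
  return r;
}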
{ + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + } + PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( + "out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } + + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + if ("trilinear" == interp_method) { + TrilinearInterpolation(input, output, ratio_d, ratio_h, ratio_w, in_d, + in_h, in_w, n, c, out_d, out_h, out_w, + align_corners, align_mode, data_layout); + } +} + +template +static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor& output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_w = ctx.Attr("out_w"); + float scale_w = -1.0; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( + "scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_w > 0.) 
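Every resize in this file uses the same two coordinate conventions: with align_corners the step is (in - 1) / (out - 1), so the first and last output samples land exactly on the input corners; otherwise it is in / out, combined with the half-pixel shift ratio * (l + 0.5) - 0.5 when align_mode is 0. A small worked example, assuming a 1-D resize from in_w = 4 to out_w = 8 (standalone, not operator code):

// Worked example of the two mapping conventions used by the kernels above.
#include <cstdio>

int main() {
  const int in_w = 4, out_w = 8;
  const float ratio_corners = static_cast<float>(in_w - 1) / (out_w - 1);  // 3/7 ~ 0.4286
  const float ratio_default = static_cast<float>(in_w) / out_w;            // 0.5
  // Last output column (l = 7): align_corners maps it onto the last input
  // column exactly (3.0), the half-pixel convention maps it to 3.25.
  std::printf("align_corners: %.3f  half-pixel: %.3f\n",
              ratio_corners * 7,
              ratio_default * (7 + 0.5f) - 0.5f);
  return 0;
}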
{ + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_w = out_size_data[0]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_w = new_size[0]; + } + + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_w}; + } else { + dim_grad = {n, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + if ("linear" == interp_method) { + LinearInterpolationGrad(output_grad, input_grad, ratio_w, in_w, n, c, + out_w, align_corners, align_mode, data_layout); + } +} + +template +static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor& output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_w = scale_data[0]; + scale_h = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_h > 0. && scale_w > 0.) 
{ + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } + + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + if ("bilinear" == interp_method) { + BilinearInterpolationGrad(output_grad, input_grad, ratio_h, ratio_w, + in_h, in_w, n, c, out_h, out_w, align_corners, + align_mode, data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolateGrad(output_grad, input_grad, ratio_h, ratio_w, + n, c, out_h, out_w, align_corners, + data_layout); + } else if ("bicubic" == interp_method) { + BicubicInterpolationGrad(output_grad, input_grad, ratio_h, ratio_w, in_h, + in_w, n, c, out_h, out_w, align_corners, + data_layout); + } +} + +template +static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0 && scale_d > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
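The *Grad helpers above (LinearInterpolationGrad through BicubicInterpolationGrad) all follow one pattern: input_grad is zero-filled, then each output gradient is scattered onto the neighbouring input cells with the same lambda weights the forward pass used, so the weights written for any output location sum to 1. A minimal 1-D sketch of that scatter, assuming align_mode == 0 and align_corners == false (the half-pixel mapping) and an illustrative helper name (LinearGrad1D):

// 1-D linear-interpolation backward scatter (illustrative sketch).
#include <algorithm>
#include <vector>

std::vector<float> LinearGrad1D(const std::vector<float>& dout, int in_w) {
  const int out_w = static_cast<int>(dout.size());
  const float ratio = static_cast<float>(in_w) / out_w;
  std::vector<float> dx(in_w, 0.f);  // must start at zero: contributions accumulate
  for (int l = 0; l < out_w; ++l) {
    float src = std::max(ratio * (l + 0.5f) - 0.5f, 0.f);
    int x_w = static_cast<int>(src);        // west neighbour
    int x_e = std::min(x_w + 1, in_w - 1);  // east neighbour
    float d_w = src - x_w;                  // weight of the east neighbour
    dx[x_w] += dout[l] * (1.f - d_w);
    dx[x_e] += dout[l] * d_w;
  }
  return dx;
}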
{ + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + if ("trilinear" == interp_method) { + TrilinearInterpolationGrad( + output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n, + c, out_d, out_h, out_w, align_corners, align_mode, data_layout); + } +} + +template +class InterpolateV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + auto input_dims = input->dims(); + if (input_dims.size() == 3) { // 1D interpolation + Interpolate1DCPUFwd(ctx, *input, output); + } else if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCPUFwd(ctx, *input, output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCPUFwd(ctx, *input, output); + } + } +}; + +template +class InterpolateV2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + + auto output_grad_dims = output_grad->dims(); + if (output_grad_dims.size() == 3) { // 1D interpolation grad + Interpolate1DCPUBwd(ctx, input_grad, *output_grad); + } else if (output_grad_dims.size() == 4) { // 2D interpolation grad + Interpolate2DCPUBwd(ctx, input_grad, *output_grad); + } else if (output_grad_dims.size() == 5) { // 3D interpolation grad + Interpolate3DCPUBwd(ctx, input_grad, *output_grad); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..72da43e3bc63c1c585fe19d703892c23ce7b0ec2 --- /dev/null +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/isfinite_v2_op.h" +#include +#include +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/platform/float16.h" + +namespace plat = paddle::platform; + +namespace paddle { +namespace operators { + +class OverflowV2Op : public framework::OperatorWithKernel { + public: + OverflowV2Op(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); + UnaryOpUnchangedInferShape(ctx); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + int dtype = -1; + auto *x_var = ctx.InputVar("X"); + if (x_var->IsType()) { + dtype = x_var->Get().type(); + } else if (x_var->IsType()) { + dtype = x_var->Get().value().type(); + } else { + PADDLE_THROW(plat::errors::InvalidArgument( + "Cannot find the input data type by all input data")); + } + return framework::OpKernelType(framework::proto::VarType::Type(dtype), + ctx.GetPlace()); + } +}; + +class OverflowV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input tensors of overflowv2 operator."); + AddOutput("Out", + "(Tensor) The output tensor of overflowv2 operator. " + "Same size compare to input tensor"); + AddComment(string::Sprintf(R"DOC( +Overflow %s operator. + +$$Out = %s(X)$$ + +Check whether each element of X is Inf or Nan, return the bool result of each +element of X as a tensor. 
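One OverflowV2OpMaker serves isinf_v2, isnan_v2 and isfinite_v2: the GetName/GetComments hooks are filled in per operator by the REGISTER_V2OP_MAKER macro further down, and the registered OverflowKernel instances hand the whole input tensor to the matching functor declared in isfinite_v2_op.h below, which yields the per-element bool result the comment above describes. A rough scalar-level sketch of that per-element check (ElementwiseCheck is an illustrative name, not the real kernel plumbing):

// Per-element semantics shared by the three checks (sketch only; the real
// functors delegate to the TensorContains*/TensorIsfinite helpers).
#include <cmath>
#include <vector>

template <typename Pred>
std::vector<bool> ElementwiseCheck(const std::vector<float>& x, Pred pred) {
  std::vector<bool> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = pred(x[i]);
  return out;
}

// e.g. ElementwiseCheck(x, [](float v) { return std::isinf(v); });     // isinf_v2
//      ElementwiseCheck(x, [](float v) { return std::isnan(v); });     // isnan_v2
//      ElementwiseCheck(x, [](float v) { return std::isfinite(v); });  // isfinite_v2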
+ +%s +)DOC", + GetName(), GetComments())); + } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetComments() const = 0; +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +#define REGISTER_V2OP_MAKER(op_type, comment) \ + namespace paddle { \ + namespace operators { \ + class _##op_type##OverflowV2OpMaker \ + : public ::paddle::operators::OverflowV2OpMaker { \ + protected: \ + std::string GetName() const { return #op_type; } \ + std::string GetComments() const { return comment; } \ + }; \ + } \ + } \ + REGISTER_OPERATOR( \ + op_type, ops::OverflowV2Op, ops::_##op_type##OverflowV2OpMaker, \ + paddle::framework::EmptyGradOpMaker, \ + paddle::framework::EmptyGradOpMaker) + +#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \ + REGISTER_OP_CPU_KERNEL( \ + op_type, ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel); + +REGISTER_V2OP_MAKER(isinf_v2, "isinfv2(X)"); +REGISTER_V2OP_MAKER(isnan_v2, "isnanv2(X)"); +REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); + +REGISTER_OVERFLOW_CPU_KERNEL(isinf_v2, InfinityV2Functor); +REGISTER_OVERFLOW_CPU_KERNEL(isnan_v2, NANV2Functor); +REGISTER_OVERFLOW_CPU_KERNEL(isfinite_v2, IsfiniteV2Functor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a6d818d0501e60dfffc8995075bb7f0369788fd --- /dev/null +++ b/paddle/fluid/operators/isfinite_v2_op.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/isfinite_v2_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +#define REGISTER_OVERFLOW_CUDA_KERNEL(op_type, functor) \ + REGISTER_OP_CUDA_KERNEL( \ + op_type, ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel); + +REGISTER_OVERFLOW_CUDA_KERNEL(isinf_v2, InfinityV2Functor); +REGISTER_OVERFLOW_CUDA_KERNEL(isnan_v2, NANV2Functor); +REGISTER_OVERFLOW_CUDA_KERNEL(isfinite_v2, IsfiniteV2Functor); diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/fluid/operators/isfinite_v2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9f0aa63ce80248ee9f7839890f611b9d5293789e --- /dev/null +++ b/paddle/fluid/operators/isfinite_v2_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +struct InfinityV2Functor { + void operator()(const framework::Tensor& tensor, framework::Tensor* out) { + framework::TensorContainsInfV2(tensor, out); + } +}; + +struct NANV2Functor { + void operator()(const framework::Tensor& tensor, framework::Tensor* out) { + framework::TensorContainsNANV2(tensor, out); + } +}; + +struct IsfiniteV2Functor { + void operator()(const framework::Tensor& tensor, framework::Tensor* out) { + framework::TensorIsfiniteV2(tensor, out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 0a7146be83dcb673573f1fdcb94ed2d2c57bd2c3..2c3172d2a1112e2c79a3c1215ccd0d3f08d59451 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -53,11 +53,9 @@ class LinspaceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - framework::LibraryType library_{framework::LibraryType::kPlain}; - framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "Start"), - ctx.device_context(), layout_, library_); + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); } }; @@ -73,6 +71,7 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Num", "Number of entry in the sequence. It is a tensor of shape [1], " "should be of type int32."); + AddAttr("dtype", "The output data type."); AddOutput("Out", "A sequence of numbers."); AddComment(R"DOC( Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy. 
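The kernel changes in the hunks below compute the step in double precision and evaluate each entry as start + step * i instead of accumulating value += step, which avoids drift for large Num and low-precision output types. A small standalone sketch of the same scheme (the Linspace helper name is illustrative, not the operator code):

// Drift-free linspace sketch: hold the step in double and index directly.
#include <vector>

template <typename T>
std::vector<T> Linspace(T start, T stop, int num) {
  std::vector<T> out(num);
  if (num == 1) {  // a single entry is just the start value
    out[0] = start;
    return out;
  }
  const double step = static_cast<double>(stop - start) / (num - 1);
  for (int i = 0; i < num; ++i) {
    out[i] = static_cast<T>(start + step * i);  // no running accumulation
  }
  return out;
}

// e.g. Linspace<float>(0.f, 1.f, 5) -> {0, 0.25, 0.5, 0.75, 1}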
@@ -85,4 +84,6 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, + ops::CPULinspaceKernel, + ops::CPULinspaceKernel, ops::CPULinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu index 47d4536dcfe2a0ab43b3584196a138214e438e3e..8aca892a81d41b1e0a9f7f9c14169c2817ae9452 100644 --- a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -20,13 +20,15 @@ namespace paddle { namespace operators { template -__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { - CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +__global__ void LinspaceKernel(T start, double step, int64_t size, T* out) { + CUDA_KERNEL_LOOP(index, size) { + out[index] = static_cast(start + step * index); + } } template __global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = start; + out[0] = static_cast(start); } template @@ -51,9 +53,9 @@ class CUDALinspaceKernel : public framework::OpKernel { out->Resize(framework::make_ddim({num})); T* out_data = out->mutable_data(context.GetPlace()); - T step = 0; + double step = 0; if (num != 1) { - step = (stop - start) / (num - 1); + step = (static_cast(stop - start)) / (num - 1); } auto stream = context.cuda_device_context().stream(); @@ -68,4 +70,6 @@ class CUDALinspaceKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, + ops::CUDALinspaceKernel, + ops::CUDALinspaceKernel, ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h index b1fcac73b0ad249aa19859bde770a8554cdb7408..9fb4960375ed7be60598d558c65310bd4a4b84bc 100644 --- a/paddle/fluid/operators/linspace_op.h +++ b/paddle/fluid/operators/linspace_op.h @@ -35,14 +35,12 @@ class CPULinspaceKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); if (num > 1) { - T step = (stop - start) / (num - 1); - T value = start; + double step = (static_cast(stop - start)) / (num - 1); for (int i = 0; i < num; ++i) { - out_data[i] = value; - value += step; + out_data[i] = static_cast(start + step * i); } } else { - out_data[0] = start; + out_data[0] = static_cast(start); } } }; diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b44c02757fae9648a7e660a06c03af45d621e02 --- /dev/null +++ b/paddle/fluid/operators/masked_select_op.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/masked_select_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class MaskedSelectOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); + OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); + framework::DDim output_dims(ctx->GetInputDim("X")); + ctx->SetOutputDim("Y", output_dims); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class MaskedSelectOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("Mask", + "The mask of Input Tensor to be selected which is a bool Tensor."); + AddOutput( + "Y", + "The returned tensor, the data type " + "is same as input, will be on the same device with the input Tensor."); + AddComment(R"DOC( +Size Operator. + +Return a new 0-D tensor which indexes the indexed tensor according +the mask which is a tensor withe data type bool. +)DOC"); + } +}; + +class MaskedSelectOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Input", + "Input", "MaskedSelect"); + OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Y")), + ctx.device_context()); + } +}; + +template +class MaskedSelectGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("masked_select_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Mask", this->Input("Mask")); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer, + "X"); +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker, + ops::MaskedSelectGradOpMaker, + ops::MaskedSelectGradOpMaker); +REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad, + ops::MaskedSelectedGradNoNeedBufferVarsInferer); + +REGISTER_OP_CPU_KERNEL( + masked_select, + ops::MaskedSelectKernel, + ops::MaskedSelectKernel, + ops::MaskedSelectKernel, + ops::MaskedSelectKernel); +REGISTER_OP_CPU_KERNEL( + masked_select_grad, + ops::MaskedSelectGradKernel, + ops::MaskedSelectGradKernel, + ops::MaskedSelectGradKernel, + ops::MaskedSelectGradKernel); diff --git 
a/paddle/fluid/operators/masked_select_op.cu b/paddle/fluid/operators/masked_select_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7dc0516800c483d1d82a2390a64130e77b1efb01 --- /dev/null +++ b/paddle/fluid/operators/masked_select_op.cu @@ -0,0 +1,179 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include +#include +#include "paddle/fluid/operators/masked_select_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DDim = framework::DDim; + +__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < size; idx += blockDim.x * gridDim.x) { + if (mask[idx]) + mask_array[idx] = 1; + else + mask_array[idx] = 0; + } +} + +template +__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum, + const bool* mask, const T* input, T* out, + int size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < size; idx += blockDim.x * gridDim.x) { + if (mask[idx]) { + int index = mask_prefix_sum[idx]; + out[index] = input[idx]; + } + } +} + +template +__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum, + const bool* mask, const T* input, + T* out, int size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < size; idx += blockDim.x * gridDim.x) { + if (mask[idx]) { + int index = mask_prefix_sum[idx]; + out[idx] = input[index]; + } else { + out[idx] = 0; + } + } +} + +template +class MaskedSelectCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto input = ctx.Input("X"); + auto mask = ctx.Input("Mask"); + auto out = ctx.Output("Y"); + auto* mask_data = mask->data(); + auto input_data = input->data(); + + auto mask_size = mask->numel(); + auto input_dim = input->dims(); + auto mask_dim = mask->dims(); + PADDLE_ENFORCE_EQ( + input_dim, mask_dim, + platform::errors::InvalidArgument( + "The dim size of input and mask in OP(masked_selected) " + "must be equal, but got input dim:(%ld), mask dim: " + "(%ld). 
Please check input " + "value.", + input_dim, mask_dim)); + + thrust::device_ptr mask_dev_ptr = + thrust::device_pointer_cast(mask_data); + thrust::device_vector mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size); + auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true); + + framework::DDim out_dim{out_size}; + out->Resize(out_dim); + auto out_data = out->mutable_data(ctx.GetPlace()); + + Tensor mask_array; + Tensor mask_prefix_sum; + mask_array.Resize(mask_dim); + mask_prefix_sum.Resize(mask_dim); + + int32_t* mask_array_data = mask_array.mutable_data(ctx.GetPlace()); + int32_t* mask_prefix_sum_data = + mask_prefix_sum.mutable_data(ctx.GetPlace()); + int threads = 512; + int grid = (mask_size + threads - 1) / threads; + auto stream = ctx.cuda_device_context().stream(); + SetMaskArray<<>>(mask_data, mask_array_data, + mask_size); + + thrust::device_ptr mask_array_dev_ptr = + thrust::device_pointer_cast(mask_array_data); + thrust::device_vector mask_array_vec( + mask_array_dev_ptr, mask_array_dev_ptr + mask_size); + thrust::exclusive_scan(thrust::device, mask_array_vec.begin(), + mask_array_vec.end(), mask_prefix_sum_data); + + SelectWithPrefixMask<<>>( + mask_prefix_sum_data, mask_data, input_data, out_data, mask_size); + } +}; + +template +class MaskedSelectGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto input = ctx.Input(framework::GradVarName("Y")); + auto mask = ctx.Input("Mask"); + auto out = ctx.Output(framework::GradVarName("X")); + auto* mask_data = mask->data(); + auto* input_data = input->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); + + auto input_size = input->numel(); + auto mask_size = mask->numel(); + auto mask_dim = mask->dims(); + + auto out_size = mask_size; + + Tensor mask_array; + Tensor mask_prefix_sum; + mask_array.Resize(mask_dim); + mask_prefix_sum.Resize(mask_dim); + + int32_t* mask_array_data = mask_array.mutable_data(ctx.GetPlace()); + int32_t* mask_prefix_sum_data = + mask_prefix_sum.mutable_data(ctx.GetPlace()); + int threads = 512; + int grid = (mask_size + threads - 1) / threads; + auto stream = ctx.cuda_device_context().stream(); + SetMaskArray<<>>(mask_data, mask_array_data, + mask_size); + + thrust::device_ptr mask_array_dev_ptr = + thrust::device_pointer_cast(mask_array_data); + thrust::device_vector mask_array_vec( + mask_array_dev_ptr, mask_array_dev_ptr + mask_size); + thrust::exclusive_scan(thrust::device, mask_array_vec.begin(), + mask_array_vec.end(), mask_prefix_sum_data); + + SelectGradWithPrefixMask<<>>( + mask_prefix_sum_data, mask_data, input_data, out_data, mask_size); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + masked_select, + ops::MaskedSelectCUDAKernel, + ops::MaskedSelectCUDAKernel, + ops::MaskedSelectCUDAKernel, + ops::MaskedSelectCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + masked_select_grad, + ops::MaskedSelectGradCUDAKernel, + ops::MaskedSelectGradCUDAKernel, + ops::MaskedSelectGradCUDAKernel, + ops::MaskedSelectGradCUDAKernel); diff --git a/paddle/fluid/operators/masked_select_op.h b/paddle/fluid/operators/masked_select_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ce8371556c82fe105b6719e845d4fd6232f3a95e --- /dev/null +++ b/paddle/fluid/operators/masked_select_op.h @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
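A rough CPU analogue of the CUDA path above, kept as a sketch: the same three steps (bool mask to a 0/1 array, exclusive prefix sum, scatter each selected element to its prefix index) expressed with the standard library instead of thrust. Names are illustrative.

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Compact `input` by `mask` the way the CUDA kernels above do: an exclusive
// prefix sum over the 0/1 mask gives each selected element its output slot.
template <typename T>
std::vector<T> MaskedSelect(const std::vector<T>& input,
                            const std::vector<bool>& mask) {
  const std::size_t n = input.size();
  std::vector<std::int32_t> mask01(n), prefix(n);
  for (std::size_t i = 0; i < n; ++i) mask01[i] = mask[i] ? 1 : 0;
  std::exclusive_scan(mask01.begin(), mask01.end(), prefix.begin(), 0);

  const std::size_t out_size = n == 0 ? 0 : prefix.back() + mask01.back();
  std::vector<T> out(out_size);
  for (std::size_t i = 0; i < n; ++i) {
    if (mask[i]) out[prefix[i]] = input[i];  // prefix[i] is the output index
  }
  return out;
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  std::vector<bool> m = {true, false, true, true};
  for (float v : MaskedSelect(x, m)) std::cout << v << ' ';  // prints 1 3 4
  std::cout << '\n';
}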
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DDim = framework::DDim; + +template +class MaskedSelectKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto input = context.Input("X"); + auto mask = context.Input("Mask"); + auto out = context.Output("Y"); + auto* mask_data = mask->data(); + auto input_data = input->data(); + + auto mask_size = mask->numel(); + + auto input_dim = input->dims(); + auto mask_dim = mask->dims(); + PADDLE_ENFORCE_EQ( + input_dim, mask_dim, + platform::errors::InvalidArgument( + "The dim size of input and mask in OP(masked_selected) " + "must be equal, but got input dim:(%ld), mask dim: " + "(%ld). Please check input " + "value.", + input_dim, mask_dim)); + + int out_size = 0; + for (int i = 0; i < mask_size; i++) { + if (mask_data[i]) out_size++; + } + + framework::DDim out_dim{out_size}; + out->Resize(out_dim); + auto out_data = out->mutable_data(context.GetPlace()); + + int index = 0; + for (int i = 0; i < mask_size; i++) { + if (mask_data[i]) { + out_data[index] = input_data[i]; + index++; + } + } + } +}; + +template +class MaskedSelectGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto out = context.Output(framework::GradVarName("X")); + auto mask = context.Input("Mask"); + auto input = context.Input(framework::GradVarName("Y")); + + auto* mask_data = mask->data(); + auto* input_data = input->data(); + auto* out_data = out->mutable_data(context.GetPlace()); + int mask_size = mask->numel(); + + int index = 0; + for (int i = 0; i < mask_size; i++) { + if (mask_data[i]) { + out_data[i] = input_data[index]; + index++; + } else { + out_data[i] = 0; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index f8c971954fc4c0b367cc6e62df8f7a596b651b94..42a60e9220cf848ba766a19cb7b4d13edc460c11 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -198,6 +198,11 @@ class Blas { int K, T alpha, const T* A, const T* B, T beta, T* C, int batchCount, int64_t strideA, int64_t strideB) const; + template + void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T** A, const T** B, T beta, T** C, + int batchCount) const; + #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 64b35cfeaecd1f88395db97d0374d919356651eb..d0c5f74d4efb8248b41d8b2af285e8dd7ec4d479 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ 
b/paddle/fluid/operators/math/blas_impl.cu.h @@ -458,6 +458,17 @@ void Blas::BatchedGEMM( #endif // CUDA_VERSION >= 9010 } +template <> +template +void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index cdaf53fea30085b34f07c37d50455c9b02dc5c44..892bf15738141bfbb7e75fa6b37c0cda53a8e098 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#include #include #include #include @@ -655,6 +656,26 @@ void Blas::BatchedGEMM( #endif } +template <> +template +void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const { +#ifdef PADDLE_WITH_MKLML + const int lda = std::max((transA == CblasNoTrans) ? K : M, 1); + const int ldb = std::max((transB == CblasNoTrans) ? N : K, 1); + const int ldc = std::max(N, 1); + CBlas::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, A, + &lda, B, &ldb, &beta, C, &ldc, 1 /* group_count */, + &batchCount); +#else + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +#endif +} + #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) template <> template diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 44b04104419e790b0ca8619b85ec0a1b4d701021..6748d0ab43f70f997b3008f34f4be743b81e8946 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -73,6 +73,13 @@ struct TensorSetConstantCPU { float value_; }; +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index b967dd2cfda80358b863dfcd986d4ad7104a9ac0..22164131468a46bffc239509a9213a21f1611ed5 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -111,12 +111,11 @@ __global__ void KernelPool2DGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - phstart = h_offset * output_height / input_height; - phend = - min((h_offset + 1) * output_height / input_height + 1, output_height); - pwstart = w_offset * output_width / input_width; - pwend = - min((w_offset + 1) * output_width / input_width + 1, output_width); + phstart = AdaptStartIndex(h_offset, output_height, input_height); + phend = AdaptEndIndex(h_offset, output_height, input_height); + + pwstart = AdaptStartIndex(w_offset, output_width, input_width); + pwend = AdaptEndIndex(w_offset, output_width, input_width); } else { phstart = (h_offset < ksize_height) ? 0 @@ -159,6 +158,7 @@ __global__ void KernelPool2DGrad( pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) : ksize_height * ksize_width; } + int output_sub_idx = channel_last ? (ph * output_width + pw) * channels + offsetC : ph * output_width + pw; @@ -689,15 +689,14 @@ __global__ void KernelPool3DGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - pdstart = d_offset * output_depth / input_depth; - pdend = - min((d_offset + 1) * output_depth / input_depth + 1, output_depth); - phstart = h_offset * output_height / input_height; - phend = - min((h_offset + 1) * output_height / input_height + 1, output_height); - pwstart = w_offset * output_width / input_width; - pwend = - min((w_offset + 1) * output_width / input_width + 1, output_width); + pdstart = AdaptStartIndex(d_offset, output_depth, input_depth); + pdend = AdaptEndIndex(d_offset, output_depth, input_depth); + + phstart = AdaptStartIndex(h_offset, output_height, input_height); + phend = AdaptEndIndex(h_offset, output_height, input_height); + + pwstart = AdaptStartIndex(w_offset, output_width, input_width); + pwend = AdaptEndIndex(w_offset, output_width, input_width); } else { pdstart = (d_offset < ksize_depth) ? 0 diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 238d9f2905058d267ffbee0669594920d7a9e031..a4bdc923eecc3e1283e642e592f91d658e7c9aa7 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,12 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" + #include + #include #include #include #include +#include "paddle/fluid/framework/generator.h" + namespace paddle { namespace operators { namespace math { @@ -27,7 +31,7 @@ Sampler::~Sampler() {} UniformSampler::UniformSampler(int64_t range, unsigned int seed) : Sampler(range, seed), inv_range_(1.0 / (range + 1)) { - random_engine_ = std::make_shared(seed_); + random_engine_ = framework::GetCPURandomEngine(seed_); dist_ = std::make_shared>(0, range); } @@ -37,7 +41,7 @@ float UniformSampler::Probability(int64_t value) const { return inv_range_; } LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed) : Sampler(range, seed), log_range_(log(range + 1)) { - random_engine_ = std::make_shared(seed_); + random_engine_ = framework::GetCPURandomEngine(seed_); dist_ = std::make_shared>(0, 1); } @@ -46,8 +50,8 @@ int64_t LogUniformSampler::Sample() const { // inverse_transform_sampling method // More details: // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/ - const int64_t value = - static_cast(exp((*dist_)(*random_engine_) * log_range_)) - 1; + auto cur_random = (*dist_)(*random_engine_); + const int64_t value = static_cast(exp(cur_random * log_range_)) - 1; // Mathematically, value should be <= range_, but might not be due to some // floating point roundoff, so we mod by range_. 
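For context, a self-contained sketch of the inverse-transform sampling this comment describes, assuming the engine is a shared std::mt19937_64 as returned by the new helper; the function name and signature are illustrative only.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>

// Draw integers in [0, range) with probability roughly proportional to
// 1/(v+1): sample u uniformly in [0, 1), map it through
// exp(u * log(range + 1)) - 1, and fold any floating-point overshoot back
// into range with the final modulo.
std::int64_t SampleLogUniform(std::int64_t range, std::mt19937_64& engine) {
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  const double log_range = std::log(static_cast<double>(range) + 1.0);
  const double u = dist(engine);
  const std::int64_t value =
      static_cast<std::int64_t>(std::exp(u * log_range)) - 1;
  return value % range;
}

int main() {
  std::mt19937_64 engine(0);
  for (int i = 0; i < 8; ++i) std::cout << SampleLogUniform(100, engine) << ' ';
  std::cout << '\n';
}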
return value % range_; @@ -65,7 +69,7 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities, const int *alias, const float *alias_probabilities, unsigned int seed) : Sampler(range, seed) { - random_engine_ = std::make_shared(seed_); + random_engine_ = framework::GetCPURandomEngine(seed_); real_dist_ = std::make_shared>(0, 1); int_dist_ = std::make_shared>(0, range); diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 3fa5a7ae336a9be984324411b88570aea99c2c78..480576ef9dc8c21811a1a867d553ccc6d97fa22a 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -26,8 +26,8 @@ namespace math { // TODO(wanghaoshuang): Support for GPU /** -* Sample integers from [0, range). -*/ + * Sample integers from [0, range). + */ class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { @@ -117,7 +117,7 @@ class CustomSampler : public Sampler { const int* alias_; const float* probs_; const int exceptional_val = -1; - std::shared_ptr random_engine_; + std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; }; diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0254ad0a563d91282e76cd7bf43343e4d9139842 --- /dev/null +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
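The sampler and gaussian_random changes both replace a locally seeded engine with the shared one from framework::GetCPURandomEngine. The stand-in below only illustrates the assumed behaviour (seed 0 means seed from the OS, otherwise deterministic, engine handed out behind a shared_ptr); the real helper's engine type and caching are not shown in this patch, so treat every detail here as an assumption.

#include <cstdint>
#include <memory>
#include <random>

// Hypothetical stand-in for framework::GetCPURandomEngine: returns a seeded
// engine behind a shared_ptr so kernels can hold and reuse it. The real
// helper presumably also handles global seeding and caching.
std::shared_ptr<std::mt19937_64> GetCPURandomEngineSketch(std::uint64_t seed) {
  if (seed == 0) {
    std::random_device device;
    seed = (static_cast<std::uint64_t>(device()) << 32) | device();
  }
  return std::make_shared<std::mt19937_64>(seed);
}

int main() {
  auto engine = GetCPURandomEngineSketch(42);
  std::normal_distribution<float> dist(0.f, 1.f);
  float sample = dist(*engine);  // same dereference pattern as the kernels above
  (void)sample;
}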
+ +#include "paddle/fluid/operators/matmul_v2_op.h" +#include +#include + +namespace paddle { +namespace operators { + +class MatMulV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matmul_v2"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "matmul_v2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matmul_v2"); + bool trans_x = ctx->Attrs().Get("trans_x"); + bool trans_y = ctx->Attrs().Get("trans_y"); + + std::vector dims_x = + paddle::framework::vectorize(ctx->GetInputDim("X")); + std::vector dims_y = + paddle::framework::vectorize(ctx->GetInputDim("Y")); + auto ndims_x = dims_x.size(); + auto ndims_y = dims_y.size(); + + bool x_broadcasted = false, y_broadcasted = false; + if (ndims_x == 1) { + dims_x.insert(dims_x.begin(), 1); + ndims_x = 2; + x_broadcasted = true; + } + + if (ndims_y == 1) { + dims_y.push_back(1); + ndims_y = 2; + y_broadcasted = true; + } + + size_t M, N; + if (trans_x) { + M = dims_x[ndims_x - 1]; + } else { + M = dims_x[ndims_x - 2]; + } + if (trans_y) { + N = dims_y[ndims_y - 2]; + } else { + N = dims_y[ndims_y - 1]; + } + + std::vector new_dims; + if (ndims_x >= ndims_y) { + new_dims.assign(dims_x.begin(), dims_x.end() - 2); + } else { + new_dims.assign(dims_y.begin(), dims_y.end() - 2); + } + if (!x_broadcasted) { + new_dims.push_back(M); + } + if (!y_broadcasted) { + new_dims.push_back(N); + } + if (x_broadcasted && y_broadcasted) { + new_dims.push_back(1); + } + + auto out_dims = framework::make_ddim(new_dims); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /* --> */ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } +}; + +class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "tensor of shape (d0, d1 ... M, K)"); + AddInput("Y", "tensor of shape (d0, d1 ... K, N)"); + AddOutput("Out", "tensor of shape (d0, d1 ... M, N)"); + AddAttr("trans_x", + "Set true to transpose the last two dimensions of X before " + "doing multiplication") + .SetDefault(false); + AddAttr("trans_y", + "Set true to transpose the last two dimensions of Y before " + "doing multiplication") + .SetDefault(false); + AddComment( + R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), + B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). + In addition, it also follows the broadcast rule which is similar as + numpy.matmul. 
+)DOC"); + } +}; + +class MatMulV2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matmul_v2"); + OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul_v2"); + OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "matmul_v2"); + auto x_dims = context->GetInputDim("X"); + auto y_dims = context->GetInputDim("Y"); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (context->HasOutput(x_grad_name)) { + context->SetOutputDim(x_grad_name, x_dims); + } + if (context->HasOutput(y_grad_name)) { + context->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +template +class MatMulV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("matmul_v2_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Y", this->Input("Y")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, + ops::MatMulV2GradOpMaker, + ops::MatMulV2GradOpMaker); + +REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad); + +REGISTER_OP_CPU_KERNEL( + matmul_v2, ops::MatMulV2Kernel, + ops::MatMulV2Kernel); + +REGISTER_OP_CPU_KERNEL( + matmul_v2_grad, + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..64ec65a23419725c7cc481beadb9383402a426bd --- /dev/null +++ b/paddle/fluid/operators/matmul_v2_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/matmul_v2_op.h" + +namespace ops = paddle::operators; +namespace plf = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(matmul_v2, + ops::MatMulV2Kernel, + ops::MatMulV2Kernel); + +REGISTER_OP_CUDA_KERNEL( + matmul_v2_grad, ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc83e4d964815ec46452bb0086cf17437b3846a4 --- /dev/null +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -0,0 +1,481 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/dot_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" + +#ifdef __NVCC__ +#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#endif + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + HOSTDEVICE inline T operator()(const T& x) const { return x; } +}; + +template +void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output, + const std::vector& reduce_dims, + const paddle::framework::ExecutionContext& ctx) { + if (reduce_dims.empty()) { + // FIXME maybe reduce this copy operation + framework::TensorCopySync(*input, ctx.GetPlace(), output); + return; + } +#ifdef __NVCC__ + auto stream = ctx.cuda_device_context().stream(); + TensorReduce>( + *input, output, reduce_dims, static_cast(0), cub::Sum(), + IdentityFunctor(), stream); +#else + ReduceKernelFunctor( + input, output, reduce_dims, true, false, ctx) + .template apply(); +#endif +} + +static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims, + const int y_ndim, const std::int64_t* y_dims, + std::int64_t* x_bd_dims, + std::int64_t* y_bd_dims, + std::int64_t* out_bd_dims) { + const int ndim = std::max(x_ndim, y_ndim); + std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1); + std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1); + std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim); + std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim); + + for (int i = 0; i < ndim; ++i) { + PADDLE_ENFORCE_EQ( + x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1, + true, platform::errors::InvalidArgument( + "Input(X) and Input(Y) has error dim.")); + if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) { + out_bd_dims[i] = 0; + } else { + out_bd_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]); + } + } +} + +static int64_t GetIndexMessage(const int n, const int64_t* dims, + const int64_t* index) { + int64_t sum = 0; + for (int i = 0; i < n; ++i) { + if (dims[i] > 1) { + sum = sum * dims[i] + index[i]; + } + } + return sum; +} + +static void IndexIncreaseFromDims(const int ndim, const int64_t* dims, + int64_t* index) { + for (int i = ndim - 1; i >= 0; --i) { + ++index[i]; + if (index[i] >= dims[i]) { + index[i] -= dims[i]; + } else { + break; + } + } +} + +template +void MatMulFunction(const Tensor* X, const Tensor* Y, + const std::vector& x_dims, + const std::vector& y_dims, Tensor* Out, + bool trans_x, bool trans_y, + const paddle::framework::ExecutionContext& ctx) { + const int x_ndim = x_dims.size(); + const int y_ndim = y_dims.size(); + + // get data ptr + const T* x_data = X->data(); + const T* y_data = Y->data(); + + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ(X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers is not equal to Y's numbers," + "when X/Y's dims =1")); + VLOG(3) << "MatMul's case 1"; + Out->Resize({1}); 
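The broadcast bookkeeping used in MatMulFunction (GetBroadcastFromDims, GetIndexMessage, IndexIncreaseFromDims) is easier to see in isolation. A small sketch with the same conventions, with validity and zero-size checks omitted and with illustrative names:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Right-align two dim vectors to a common rank; size-1 (or missing) dims
// broadcast against the other operand.
void BroadcastDims(const std::vector<std::int64_t>& x,
                   const std::vector<std::int64_t>& y,
                   std::vector<std::int64_t>* x_bd,
                   std::vector<std::int64_t>* y_bd,
                   std::vector<std::int64_t>* out_bd) {
  const std::size_t ndim = std::max(x.size(), y.size());
  x_bd->assign(ndim, 1);
  y_bd->assign(ndim, 1);
  out_bd->assign(ndim, 1);
  std::copy(x.begin(), x.end(), x_bd->begin() + (ndim - x.size()));
  std::copy(y.begin(), y.end(), y_bd->begin() + (ndim - y.size()));
  for (std::size_t i = 0; i < ndim; ++i)
    (*out_bd)[i] = std::max((*x_bd)[i], (*y_bd)[i]);
}

// Advance a multi-dimensional index like an odometer.
void NextIndex(const std::vector<std::int64_t>& dims,
               std::vector<std::int64_t>* index) {
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    if (++(*index)[i] < dims[i]) break;
    (*index)[i] = 0;
  }
}

// Flatten an index while skipping broadcast (size-1) dims, so a broadcast
// operand reuses the same slice for every broadcast position.
std::int64_t FlattenSkippingBroadcast(const std::vector<std::int64_t>& dims,
                                      const std::vector<std::int64_t>& index) {
  std::int64_t flat = 0;
  for (std::size_t i = 0; i < dims.size(); ++i)
    if (dims[i] > 1) flat = flat * dims[i] + index[i];
  return flat;
}

int main() {
  std::vector<std::int64_t> xb, yb, ob;
  BroadcastDims({4, 1, 3}, {2, 3}, &xb, &yb, &ob);  // out batch dims {4, 2, 3}
  std::vector<std::int64_t> idx(ob.size(), 0);
  for (int step = 0; step < 3; ++step) {
    std::cout << FlattenSkippingBroadcast(xb, idx) << ' ';
    NextIndex(ob, &idx);
  }
  std::cout << '\n';
}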
+ Out->mutable_data(ctx.GetPlace()); + auto out_eigen = framework::EigenScalar::From(*Out); + auto x_eigen = framework::EigenVector::Flatten(*X); + auto y_eigen = framework::EigenVector::Flatten(*Y); + + auto& dev = *ctx.template device_context().eigen_device(); + out_eigen.device(dev) = (x_eigen * y_eigen).sum(); + return; + } + + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + if (x_ndim == 1) { + const int N = X->numel(); + if (trans_y) { + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 1], N, + platform::errors::InvalidArgument("Input(Y) has error dim.")); + } else { + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 2], N, + platform::errors::InvalidArgument("Input(Y) has error dim.")); + } + std::vector out_dims(y_ndim - 1); + if (trans_y) { + std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); + } else { + std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); + out_dims.back() = y_dims.back(); + } + Out->Resize(framework::make_ddim(out_dims)); + Out->mutable_data(ctx.GetPlace()); + if (trans_y) { + const int M = Y->numel() / N; + VLOG(3) << "MatMul's case 2"; + blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data()); + } else { + const int M = y_dims[y_ndim - 1]; + const int batch_size = Y->numel() / (M * N); + if (batch_size == 1) { + VLOG(3) << "MatMul's case 3"; + blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data()); + } else { + VLOG(3) << "MatMul's case 4"; + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data, + x_data, 0, Out->data(), batch_size, M * N, 0); + } + } + return; + } + + if (y_ndim == 1) { + const int N = Y->numel(); + if (trans_x) { + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 2], N, + platform::errors::InvalidArgument("Input(X) has error dim.")); + } else { + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 1], N, + platform::errors::InvalidArgument("Input(X) has error dim.")); + } + std::vector out_dims(x_ndim - 1); + if (trans_x) { + std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); + out_dims.back() = x_dims.back(); + } else { + std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); + } + Out->Resize(framework::make_ddim(out_dims)); + Out->mutable_data(ctx.GetPlace()); + + if (trans_x) { + const int M = x_dims[x_ndim - 1]; + const int batch_size = X->numel() / (M * N); + if (batch_size == 1) { + VLOG(3) << "MatMul's case 5"; + blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data()); + } else { + VLOG(3) << "MatMul's case 6"; + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data, + y_data, 0, Out->data(), batch_size, M * N, 0); + } + } else { + const int M = X->numel() / N; + VLOG(3) << "MatMul's case 7"; + blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data()); + } + return; + } + + const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (trans_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, platform::errors::InvalidArgument( + "Input(X) has error dim.")); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, platform::errors::InvalidArgument( + "Input(X) has error dim.")); + } + const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; + const int ndim = std::max(x_ndim, y_ndim); + std::vector x_broadcast_dims(ndim); + std::vector y_broadcast_dims(ndim); + std::vector out_broadcast_dims(ndim); + + GetBroadcastFromDims(x_ndim - 2, x_dims.data(), y_ndim - 2, y_dims.data(), + x_broadcast_dims.data(), y_broadcast_dims.data(), + out_broadcast_dims.data()); + + out_broadcast_dims[ndim - 2] = M; + out_broadcast_dims[ndim - 1] = N; + + Out->Resize(framework::make_ddim(out_broadcast_dims)); + Out->mutable_data(ctx.GetPlace()); + + const int batch_dim = ndim - 2; + // broadcast message + const bool is_broadcast_dims = !std::equal( + x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim, + y_broadcast_dims.cbegin()); + + const std::int64_t x_batch_size = std::accumulate( + x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim, 1LL, + std::multiplies()); + const std::int64_t y_batch_size = std::accumulate( + y_broadcast_dims.cbegin(), y_broadcast_dims.cbegin() + batch_dim, 1LL, + std::multiplies()); + const std::int64_t out_batch_size = std::accumulate( + out_broadcast_dims.cbegin(), out_broadcast_dims.cbegin() + batch_dim, 1LL, + std::multiplies()); + if (out_batch_size == 0) return; + if (x_batch_size == 1 && y_batch_size == 1) { + VLOG(3) << "MatMul's case 8"; + blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, + y_data, 0.0f, Out->data()); + } else if (x_batch_size == 1) { + if (M == 1 && trans_y) { + VLOG(3) << "MatMul's case 9"; + blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f, + Out->data()); + } else { + VLOG(3) << "MatMul's case 10"; + blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, + x_data, y_data, 0, Out->data(), out_batch_size, 0, + K * N); + } + } else if (y_batch_size == 1) { + if (!trans_x) { + VLOG(3) << "MatMul's case 11"; + blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, + x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f, + Out->data()); + } else { + VLOG(3) << "MatMul's case 12"; + blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, + 1.0f, x_data, y_data, 0, Out->data(), out_batch_size, + M * K, 0); + } + } else if (!is_broadcast_dims) { + VLOG(3) << "MatMul's case 13"; + blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, + y_data, 0, Out->data(), out_batch_size, M * K, K * N); + } else { + // in the case, can't use stridedgemm + std::vector x_ptr(out_batch_size); + std::vector y_ptr(out_batch_size); + std::vector out_ptr(out_batch_size); + std::vector index(batch_dim, 0); + for (std::int64_t i = 0; i < out_batch_size; ++i) { + // using the index to get offset + const std::int64_t x_index = + GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); + const std::int64_t y_index = + GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); + + x_ptr[i] = x_data + x_index * M * K; + y_ptr[i] = y_data + y_index * K * N; + out_ptr[i] = Out->data() + i * M * N; + IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); + } + VLOG(3) << "MatMul's case 14"; + blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? 
CblasTrans : CblasNoTrans, M, N, K, 1.0f, + x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(), + out_batch_size); + } +} + +template +void MatMulFunction(const Tensor* X, const Tensor* Y, Tensor* Out, bool trans_x, + bool trans_y, + const paddle::framework::ExecutionContext& ctx) { + const std::vector x_dims = vectorize(X->dims()); + const std::vector y_dims = vectorize(Y->dims()); + MatMulFunction(X, Y, x_dims, y_dims, Out, trans_x, trans_y, + ctx); +} + +template +class MatMulV2Kernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + MatMulFunction(X, Y, Out, trans_x, trans_y, ctx); + } +}; + +template +class MatMulV2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + // get dims + std::vector x_dims = vectorize(X->dims()); + std::vector y_dims = vectorize(Y->dims()); + std::vector dout_dims = vectorize(dOut->dims()); + + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int ndim = dout_dims.size(); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + // x's or y's dim = 1 + if (x_ndim == 1 && y_ndim == 1) { + if (dx) dx->mutable_data(ctx.GetPlace()); + if (dy) dy->mutable_data(ctx.GetPlace()); + if (dOut->numel() == 1) { + DotGradFunction(X, Y, dOut, dx, dy, ctx); + return; + } + } + // It is very tricky. For this broadcast, currently using the reduce sum to + // get gradient. 
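The reduction mentioned here follows the usual rule: any input axis of size 1 that was broadcast in the forward pass must have its gradient summed back over that axis. A toy version for a single broadcast batch axis, with row-major storage assumed:

#include <cstddef>
#include <iostream>
#include <vector>

// If X of shape (1, M, N) was broadcast to (B, M, N) in the forward pass,
// dX is obtained by summing dX_help over the broadcast batch axis.
std::vector<float> ReduceBroadcastAxis(const std::vector<float>& dx_help,
                                       std::size_t B, std::size_t M,
                                       std::size_t N) {
  std::vector<float> dx(M * N, 0.f);
  for (std::size_t b = 0; b < B; ++b)
    for (std::size_t i = 0; i < M * N; ++i) dx[i] += dx_help[b * M * N + i];
  return dx;
}

int main() {
  // B=2, M=1, N=2: dX_help = [[1, 2], [3, 4]] -> dX = [4, 6]
  for (float v : ReduceBroadcastAxis({1, 2, 3, 4}, 2, 1, 2))
    std::cout << v << ' ';
  std::cout << '\n';
}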
+ if (x_ndim == 1) { + x_dims.insert(x_dims.begin() + 0, 1); + x_ndim += 1; + if (trans_x) + dout_dims.push_back(1); + else + dout_dims.insert(dout_dims.begin() + ndim - 1, 1); + ndim += 1; + } + + if (y_ndim == 1) { + y_dims.push_back(1); + y_ndim += 1; + if (trans_y) + dout_dims.insert(dout_dims.begin() + ndim - 1, 1); + else + dout_dims.push_back(1); + ndim += 1; + } + + // the normal case + Tensor dx_help, dy_help; + if (trans_x) { + if (trans_y) { + // X'Y': dA = Y'G', dB = G'X' + if (dx) + MatMulFunction(Y, dOut, y_dims, dout_dims, &dx_help, + true, true, ctx); + if (dy) + MatMulFunction(dOut, X, dout_dims, x_dims, &dy_help, + true, true, ctx); + } else { + // X'Y: dX = YG', dY = XG + if (dx) + MatMulFunction(Y, dOut, y_dims, dout_dims, &dx_help, + false, true, ctx); + if (dy) + MatMulFunction(X, dOut, x_dims, dout_dims, &dy_help, + false, false, ctx); + } + } else { + if (trans_y) { + // XY': dX = GY, dY = G'X + if (dx) + MatMulFunction(dOut, Y, dout_dims, y_dims, &dx_help, + false, false, ctx); + if (dy) + MatMulFunction(dOut, X, dout_dims, x_dims, &dy_help, + true, false, ctx); + } else { + // XY: dX = GY', dY = X'G + if (dx) + MatMulFunction(dOut, Y, dout_dims, y_dims, &dx_help, + false, true, ctx); + if (dy) + MatMulFunction(X, dOut, x_dims, dout_dims, &dy_help, + true, false, ctx); + } + } + // get help dims + const std::vector dx_help_dims = vectorize(dx_help.dims()); + const std::vector dy_help_dims = vectorize(dy_help.dims()); + + std::vector dx_broadcast_dims(ndim); + std::vector dy_broadcast_dims(ndim); + + std::fill(dx_broadcast_dims.data(), + dx_broadcast_dims.data() + ndim - x_ndim, 1); + std::fill(dy_broadcast_dims.data(), + dy_broadcast_dims.data() + ndim - y_ndim, 1); + std::copy(x_dims.data(), x_dims.data() + x_ndim, + dx_broadcast_dims.data() + ndim - x_ndim); + std::copy(y_dims.data(), y_dims.data() + y_ndim, + dy_broadcast_dims.data() + ndim - y_ndim); + + std::vector dx_reduce_dims; + std::vector dy_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { + dx_reduce_dims.push_back(idx); + } + if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { + dy_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dx) { + dx->Resize(dx_help.dims()); + ReduceSumForMatmulGrad(&dx_help, dx, dx_reduce_dims, + ctx); + dx->Resize(X->dims()); + } + if (dy) { + dy->Resize(dy_help.dims()); + ReduceSumForMatmulGrad(&dy_help, dy, dy_reduce_dims, + ctx); + dy->Resize(Y->dims()); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 17e1e1958346155af32cf75b5e9fc25cdbdd91eb..7d99bb7d2b7a7049c67788df4c507afc14880815 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -72,7 +72,7 @@ static mkldnn::memory::data_type GetDstType(bool is_int8, return dst_dt; } -template +template class ConvMKLDNNHandlerT : public platform::MKLDNNHandlerT { public: @@ -227,7 +227,7 @@ class ConvMKLDNNHandlerT platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); const auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); const auto fwd_prop_kind = is_test ? 
mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; @@ -313,29 +313,29 @@ class ConvMKLDNNHandlerT if (is_test && weights_mem_p) { return weights_mem_p; } else { - const T* filter_data = filter->data(); + const K* filter_data = filter->data(); auto weights_tz = framework::vectorize(filter->dims()); GetWeightsTz(weights_tz, groups); auto user_src_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), + weights_tz, platform::MKLDNNGetDataType(), GetWeightsFormat(filter->format(), groups, is_conv3d)); return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_p", is_test); + to_void_cast(filter_data), "@weights_mem_p", is_test); } } std::shared_ptr AcquireBiasMemoryWithReorder( const framework::Tensor* bias, const bool is_test) { - const T* bias_data = bias->data(); + const K* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( - framework::vectorize(bias->dims()), platform::MKLDNNGetDataType(), + framework::vectorize(bias->dims()), platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( - user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), + user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), "@bias_mem_p", is_test); } @@ -358,14 +358,14 @@ class ConvMKLDNNHandlerT if (residual_param->format() != platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) { auto residual_memory_p = this->AcquireResidualMemory(residual_param); - dst_memory_p = this->AcquireDstMemory(output); + dst_memory_p = this->template AcquireDstMemory(output); this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst"); } else { // Changing ShareDataWith to TensorCopy results in performance drop // on ResNet architectures // (https://github.com/PaddlePaddle/Paddle/issues/22964) output->ShareDataWith(*residual_param); - dst_memory_p = this->AcquireDstMemory(output); + dst_memory_p = this->template AcquireDstMemory(output); } return dst_memory_p; } @@ -381,7 +381,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bool is_INT8 = std::is_same::value || std::is_same::value; if (!is_INT8) { - ComputeFP32(ctx); + ComputeFP32(ctx); } else { std::string fuse_activation = ctx.Attr("fuse_activation"); bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); @@ -399,6 +399,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } } + template void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); @@ -414,7 +415,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ctx.HasInput("Bias") ? 
ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - ConvMKLDNNHandlerT handler( + ConvMKLDNNHandlerT handler( ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias, output, ctx.InputName("Input") + ctx.InputName("Filter")); @@ -429,7 +430,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst_memory_p = handler.AcquireDstMemoryWithResidual(output, residual_param); } else { - dst_memory_p = handler.AcquireDstMemory(output); + dst_memory_p = handler.template AcquireDstMemory(output); } auto conv_p = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 86c1c3232644a1fed236563a65a16bc2f6466d49..540642c7140e707441ad9c4d71ae9b777863a7bd 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -51,11 +51,11 @@ class DeQuantOpKernel : public framework::OpKernel { mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); MKLDNNMemoryFormat src_fmt = input->format(); - std::string key = - platform::CreateKey(src_dt, src_tz, ctx.OutputName("Output")); - const std::string key_prim = key + "@reorder_p"; - const std::string key_src_mem = key + "@src_mem"; - const std::string key_dst_mem = key + "@dst_mem"; + std::string key = platform::CreateKey(platform::ThreadIDasStr(), src_dt, + src_tz, ctx.OutputName("Output")); + const std::string key_prim = key + "@r"; + const std::string key_src_mem = key + "@s"; + const std::string key_dst_mem = key + "@d"; std::shared_ptr src_memory; std::shared_ptr dst_memory; diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index a50cc22e5bb0def54b057dcc23d2f6751eecc478..40737f4cd029b47dbd03069a2e4d29ad33121eb9 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -44,6 +44,7 @@ class FCPrimitiveFactory { void ExecuteFcPrimitive(const LoDTensor* input, const Tensor* weights, const Tensor* bias, LoDTensor* output, + const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx) { RecomputeOutputDims(ctx, input, weights, output); // If primitive has already been created and cached, don't create new one, @@ -74,8 +75,8 @@ class FCPrimitiveFactory { "input format is equal to ncw.")); } - // Transform weights to default MKL-DNN format - weights_ = TransposeWeights(weights); + weights_ = CreateWeightsMemory(weights); + // Since MKL-DNN has a lot of limitations on what the input/weights/output // dimensions should be, to simplify the code, the creation of primitive // descriptor has been divided into separate cases, based on the number @@ -112,10 +113,13 @@ class FCPrimitiveFactory { // Quantize weights and reorder to format chosen by FC primitive descriptor. 
QuantizeWeights(ctx, fc_prim_desc->weights_desc()); - bias_ = CreateMemory(fc_prim_desc->bias_desc(), bias); + bias_ = CreateMemoryToBeCached(fc_prim_desc->bias_desc(), bias); // If int8 is desired, quantize bias into 32-bit signed int QuantizeBias(*fc_prim_desc, ctx); + // Store weights and bias in the mkldnn cache + CacheWeightsAndBias(dev_ctx, ctx); + // Based on format determined by inner_product, create output in desired // memory format output_ = CreateDstMemory(*fc_prim_desc, ctx, output); @@ -262,14 +266,15 @@ class FCPrimitiveFactory { } // Convert data from one data format to another - mkldnn::memory Reorder(const memory::desc& src_desc, - const memory::desc& dst_desc, void* src_data) { + std::shared_ptr Reorder(const memory::desc& src_desc, + const memory::desc& dst_desc, + void* src_data) { auto src_mem = memory(src_desc, engine_, src_data); - auto dst_mem = memory(dst_desc, engine_); + auto dst_mem = std::make_shared(dst_desc, engine_); - auto reorder = mkldnn::reorder(src_mem, dst_mem); + auto reorder = mkldnn::reorder(src_mem, *dst_mem); mkldnn::stream astream(engine_); - reorder.execute(astream, src_mem, dst_mem); + reorder.execute(astream, src_mem, *dst_mem); astream.wait(); return dst_mem; @@ -277,9 +282,10 @@ class FCPrimitiveFactory { // Convert data from one data format to another and rescale it. // If the desired data type is (un)signed int8, quantization occurs here. - mkldnn::memory Reorder(const memory& src_mem, const memory::desc& dst_md, - const std::vector& scale_data) { - mkldnn::memory dst_mem = mkldnn::memory(dst_md, engine_); + std::shared_ptr ReorderWithScale( + const std::shared_ptr src_mem, const memory::desc& dst_md, + const std::vector& scale_data) { + auto dst_mem = std::make_shared(dst_md, engine_); mkldnn::primitive_attr attributes; // According to MKL-DNN's documentation mask determines along which // dimensions should the scale be applied. @@ -289,11 +295,11 @@ class FCPrimitiveFactory { // becuase we perform per-output-channel quantization int mask = CreateMask(0, scale_data.size() > 1); attributes.set_output_scales(mask, scale_data); - auto reorder = mkldnn::reorder(src_mem, dst_mem, attributes); + auto reorder = mkldnn::reorder(*src_mem, *dst_mem, attributes); mkldnn::stream astream(engine_); reorder.execute(astream, - {{MKLDNN_ARG_FROM, src_mem}, {MKLDNN_ARG_TO, dst_mem}}); + {{MKLDNN_ARG_FROM, *src_mem}, {MKLDNN_ARG_TO, *dst_mem}}); astream.wait(); return dst_mem; @@ -323,16 +329,38 @@ class FCPrimitiveFactory { return memory(desc, engine_, data); } - // Transpose weights through MKL-DNN's reorder from io to oi format. - mkldnn::memory TransposeWeights(const Tensor* weights) { + template + std::shared_ptr CreateMemoryToBeCached( + const mkldnn::memory::desc& desc, const Tensor* tensor) { + return CreateMemoryToBeCached(desc, + platform::to_void_cast(tensor->data())); + } + + std::shared_ptr CreateMemoryToBeCached( + const mkldnn::memory::desc& desc, void* data) { + return std::make_shared(desc, engine_, data); + } + + // Create weights memory and transform to default MKL-DNN format + std::shared_ptr CreateWeightsMemory(const Tensor* weights) { auto dims = framework::vectorize(weights->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); + // Transpose weights through MKL-DNN's reorder from io to oi format. 
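The reorder referred to in the comment above converts weights from io layout (input channels major) to oi. Stripped of MKL-DNN descriptors it is a plain transpose; the sketch below is only meant to make the layout change concrete, and the helper name is invented.

#include <cstddef>
#include <iostream>
#include <vector>

// FC weights arrive as "io" (rows = input channels, cols = output channels);
// the primitive wants "oi". A plain transpose expresses what the io -> oi
// reorder does, minus the format descriptors.
std::vector<float> IoToOi(const std::vector<float>& w_io, std::size_t in_c,
                          std::size_t out_c) {
  std::vector<float> w_oi(in_c * out_c);
  for (std::size_t i = 0; i < in_c; ++i)
    for (std::size_t o = 0; o < out_c; ++o)
      w_oi[o * in_c + i] = w_io[i * out_c + o];
  return w_oi;
}

int main() {
  // 2 inputs x 3 outputs, io: [[1,2,3],[4,5,6]] -> oi: [[1,4],[2,5],[3,6]]
  for (float v : IoToOi({1, 2, 3, 4, 5, 6}, 2, 3)) std::cout << v << ' ';
  std::cout << '\n';
}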
return Reorder(src_desc, dst_desc, platform::to_void_cast(weights->data())); } + void CacheWeightsAndBias(const MKLDNNDeviceContext& dev_ctx, + const ExecutionContext& ctx) { + const std::string key = platform::CreateKey(platform::ThreadIDasStr()); + const std::string weights_key = key + ctx.InputName("W"); + const std::string bias_key = key + ctx.InputName("Bias"); + dev_ctx.SetBlob(weights_key, weights_); + dev_ctx.SetBlob(bias_key, bias_); + } + // Compute the bias scales so that its values correspond to the // scale of data being an output of weights and input multiplication std::vector ComputeBiasScales(const ExecutionContext& ctx) { @@ -388,14 +416,14 @@ class FCPrimitiveFactory { } void QuantizeWeights(const ExecutionContext& ctx, memory::desc dst) { - weights_ = - Reorder(*weights_, dst, ctx.Attr>("Scale_weights")); + weights_ = ReorderWithScale(weights_, dst, + ctx.Attr>("Scale_weights")); } void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc, const ExecutionContext& ctx) { auto bias_scales = ComputeBiasScales(ctx); - bias_ = Reorder(*bias_, fc_prim_desc.bias_desc(), bias_scales); + bias_ = ReorderWithScale(bias_, fc_prim_desc.bias_desc(), bias_scales); } // Fuse relu into FC with activation type attribute has been set to 'relu' @@ -463,10 +491,10 @@ class FCPrimitiveFactory { private: const mkldnn::engine& engine_; - boost::optional bias_; boost::optional input_; boost::optional output_; - boost::optional weights_; + std::shared_ptr bias_; + std::shared_ptr weights_; boost::optional fc_; }; @@ -476,19 +504,13 @@ class FCPrimitiveFactory { template static std::shared_ptr> GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, - const ExecutionContext& ctx, const Tensor* input, - const Tensor* weights, - const mkldnn::engine& mkldnn_engine) { - const std::string key = platform::CreateKey( - platform::ThreadIDasStr(), input->format(), input->dims()[0], - framework::vectorize(weights->dims()), ctx.OutputName("Out")); - + const std::string& key) { auto prim_creator = std::static_pointer_cast>( dev_ctx.GetBlob(key)); if (prim_creator == nullptr) { - prim_creator = - std::make_shared>(mkldnn_engine); + prim_creator = std::make_shared>( + dev_ctx.GetEngine()); dev_ctx.SetBlob(key, prim_creator); } @@ -498,24 +520,24 @@ GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, // Choose appropriate primitive factory implementation based on inferred // output type (uint8, int8 or float). 
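The caching added above (CacheWeightsAndBias plus a primitive-factory key that starts with the thread id) boils down to a get-or-create lookup in a per-device blob map. A simplified, assumption-laden sketch of that pattern; BlobCache and ThreadKey are invented names, not Paddle APIs:

#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>

// Get-or-create lookup similar in spirit to dev_ctx.GetBlob / SetBlob:
// keys include the thread id so concurrent executors do not share (and race
// on) the same cached primitives and reordered weights.
class BlobCache {
 public:
  template <typename T, typename Creator>
  std::shared_ptr<T> GetOrCreate(const std::string& key, Creator create) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = blobs_.find(key);
    if (it == blobs_.end())
      it = blobs_.emplace(key, std::static_pointer_cast<void>(create())).first;
    return std::static_pointer_cast<T>(it->second);
  }

 private:
  std::mutex mutex_;
  std::map<std::string, std::shared_ptr<void>> blobs_;
};

std::string ThreadKey(const std::string& suffix) {
  std::ostringstream os;
  os << std::this_thread::get_id() << '@' << suffix;
  return os.str();
}

int main() {
  BlobCache cache;
  auto weights = cache.GetOrCreate<std::string>(
      ThreadKey("fc_weights"), [] { return std::make_shared<std::string>("w"); });
  std::cout << *weights << '\n';
}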
template -static void ExecuteFc(const MKLDNNDeviceContext& dev_ctx, - const ExecutionContext& ctx, const LoDTensor* input, +static void ExecuteFc(const ExecutionContext& ctx, const LoDTensor* input, const Tensor* w, const Tensor* bias, LoDTensor* output, - const mkldnn::engine& mkldnn_engine, bool fuse_relu, - bool force_fp32_output) { + bool fuse_relu, bool force_fp32_output) { + auto& dev_ctx = ctx.template device_context(); + const std::string prim_key = platform::CreateKey( + platform::ThreadIDasStr(), input->format(), input->dims()[0], + framework::vectorize(w->dims()), ctx.OutputName("Out")); constexpr bool is_int8 = std::is_same::value || std::is_same::value; if (!is_int8 || force_fp32_output) { - GetPrimitiveFactory(dev_ctx, ctx, input, w, mkldnn_engine) - ->ExecuteFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, prim_key) + ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); } else if (fuse_relu) { - GetPrimitiveFactory(dev_ctx, ctx, input, w, - mkldnn_engine) - ->ExecuteFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, prim_key) + ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); } else { - GetPrimitiveFactory(dev_ctx, ctx, input, w, - mkldnn_engine) - ->ExecuteFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, prim_key) + ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); } } @@ -526,9 +548,6 @@ class FCMKLDNNOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace.")); - auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto input = ctx.Input("Input"); auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); @@ -537,8 +556,8 @@ class FCMKLDNNOpKernel : public framework::OpKernel { bool fuse_relu = ctx.Attr("activation_type") == "relu"; bool force_fp32_output = ctx.Attr("force_fp32_output"); - ExecuteFc(dev_ctx, ctx, input, w, bias, output, mkldnn_engine, - fuse_relu, force_fp32_output); + ExecuteFc(ctx, input, w, bias, output, fuse_relu, + force_fp32_output); output->set_layout(DataLayout::kMKLDNN); } diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 37b6e3bb803a2b68cec54059b266bd7585ff9958..98200caca8cf66960632b88966f23e99fcd4c299 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/operators/fill_constant_op.h" #include "paddle/fluid/operators/mean_op.h" @@ -28,21 +30,17 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { float std = context.Attr("std"); auto* tensor = context.Output("Out"); - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - std::normal_distribution dist(mean, std); - const std::string op_type = "gaussian_random"; auto shape = GetShape(context, op_type); tensor->Resize(shape); T* data = tensor->mutable_data(context.GetPlace()); int64_t size = tensor->numel(); + std::normal_distribution dist(mean, std); + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); + for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); + data[i] = dist(*engine); } tensor->set_layout(DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 55bd683f8f4283287e1bd67810170bd4082379a6..29a86a35d7b26f41745907fb6bacf30506c027a0 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -48,11 +48,12 @@ class QuantOpKernel : public framework::OpKernel { const T* input_data = input->data(); bool is_negative = ctx.Attr("is_negative_input"); - std::string key = platform::CreateKey(src_tz, scale_data, is_negative, - ctx.OutputName("Output")); - const std::string key_prim = key + "@reorder_p"; - const std::string key_src_mem = key + "@src_mem"; - const std::string key_dst_mem = key + "@dst_mem"; + std::string key = + platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_data, + is_negative, ctx.OutputName("Output")); + const std::string key_prim = key + "@r"; + const std::string key_src_mem = key + "@s"; + const std::string key_dst_mem = key + "@d"; std::shared_ptr src_memory; std::shared_ptr dst_memory; diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 92e7744e3c0a459f3267f4210d42752b5ec0bcc0..5ad5ad9450503111882a9b3bc2cd9161f74d500e 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -40,11 +40,12 @@ class ReQuantOpKernel : public framework::OpKernel { auto src_tz = paddle::framework::vectorize(input->dims()); - std::string key = platform::CreateKey(src_tz, scale_in, scale_out, - ctx.OutputName("Output")); - const std::string key_prim = key + "@reorder_p"; - const std::string key_src_mem = key + "@src_mem"; - const std::string key_dst_mem = key + "@dst_mem"; + std::string key = + platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_in, + scale_out, ctx.OutputName("Output")); + const std::string key_prim = key + "@r"; + const std::string key_src_mem = key + "@s"; + const std::string key_dst_mem = key + "@d"; std::shared_ptr src_memory; std::shared_ptr dst_memory; diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index e99ccd31714787306358d9b19b31a62ff21d5dab..f0b5f4a466a0049c53d51d8610cf115d8bfe0295 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -55,8 +55,8 @@ class NLLLossOp : public framework::OperatorWithKernel { "Input(Weight) should be a 1D tensor.")); PADDLE_ENFORCE_EQ(x_dims[1], w_dims[0], 
platform::errors::InvalidArgument( - "Input(Weight) Tensor's size should match" - "to the class numer.")); + "Input(Weight) Tensor's size should match " + "to the the total number of classes.")); } } if (x_dims.size() == 2) { diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h index 92f3d169f3f6a3be1009d84ebd87c82691eb9f0c..e93d5792205900635093e5f18d715e4607f73cda 100644 --- a/paddle/fluid/operators/nll_loss_op.h +++ b/paddle/fluid/operators/nll_loss_op.h @@ -91,7 +91,7 @@ static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, } PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, platform::errors::InvalidArgument( - "label should nor be out of bounds.")); + "label should not be out of bounds.")); const auto cur_weight = weight_data ? weight_data[cur_label] : static_cast(1); out_data[index] = -x_data[i * sample_size + cur_label * map_size + @@ -117,7 +117,7 @@ static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, } PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, platform::errors::InvalidArgument( - "label should nor be out of bounds.")); + "label should not be out of bounds.")); const auto cur_weight = weight_data ? weight_data[cur_label] : static_cast(1); total_weight_val += cur_weight; diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index 92ce600f22b64f82a053233dbd130adefca964fa..7f0b2b7d064ed12875577fee2265ab17c1fce08f 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -25,15 +25,11 @@ class DGCMomentumOp : public MomentumOp { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true, - "current_step should be set."); - PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true, - platform::errors::NotFound( - "Input(nranks) of DGCMomentumOp is not found.")); - - PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true, - platform::errors::NotFound( - "Output(Grad_out) of DGCMomentumOp is not found.")); + OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step", + "DGCMomentumOp"); + OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCMomentumOp"); + OP_INOUT_CHECK(ctx->HasOutput("Grad_out"), "Output", "Grad_out", + "DGCMomentumOp"); return MomentumOp::InferShape(ctx); } diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc index 057a7a38e3f40fdeb400418740dab825f532054c..59035d5a8ca5d4214f1370e1b14b2be9b234fa6a 100644 --- a/paddle/fluid/operators/p_norm_op.cc +++ b/paddle/fluid/operators/p_norm_op.cc @@ -25,34 +25,54 @@ class PnormOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(Tensor) A tensor of rank >= axis."); AddAttr("porder", - "The porder is the p order vector norm to calculate.") + "(float, default 2) The porder is the p order vector norm " + "to calculate. Available for porder=0, inf, -inf and any " + "real number.") .SetDefault(2.0f); AddAttr("axis", - "The axis on which to apply normalization. If axis < 0, " + "The axis on which to apply norm operation. If axis < 0, " "the dimension to pnorm is rank(X) + axis. 
-1 is " "the last dimension.") .SetDefault(-1); AddAttr("epsilon", - "(float, default 1e-10) The epsilon value is used " + "(float, default 1e-12) The epsilon value is used " "to avoid division by zero.") .SetDefault(1.0e-12f); AddAttr( "keepdim", - "(bool, default false) Whether to keep the dimensions as the input") + "(bool, default false) Whether to keep the dimensions as the input.") .SetDefault(false); - AddOutput( - "Out", - "(Tensor) Output tensor for the `(sum(x.pow(p)) + epsion).pow(1/p)`"); + + AddAttr("asvector", + "(bool, default false) as vector norm when axis is None and " + "input is matrix, ") + .SetDefault(false); + AddOutput("Out", "(Tensor) Output result tensor of p-norm"); AddComment(R"DOC( +Pnorm Operator. +Given a tensor X, compute Lp-norm of X. -Given a tensor, apply 2-normalization along the provided axis. +When p = 0, defining $0^0 = 0$, the zero-norm of X is simply the number of non-zero elements of X. +$$ +||X||_{0} = \lim_{p \rightarrow 0} \sum_i |x_i|^p +$$ +When p = inf, the inf-norm of X is the maximum element of X. $$ -pnorm = \(\sum_i {abs\(x_i\)^p} \)^{1/p} +||X||_\infty = \max_i |x_i| $$ -where, $\sum_i{x_i^p}$ is calculated along the `axis` dimension. - +When p = -inf, the negative-inf-norm of X is the minimum element of X. +$$ +||X||_{-\infty} = \min_i |x_i| +$$ + +Otherwise, the p-norm of X follows the formula, +$$ +||X||_{p} = (\sum_i |x_i|^p)^{1/p} +$$ +where, $\sum_i $ is calculated along the `axis` dimension. + )DOC"); } }; @@ -63,31 +83,38 @@ class PnormOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "p_norm"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "p_norm"); - auto porder = ctx->Attrs().Get("porder"); - PADDLE_ENFORCE_NE(porder, INFINITY, - platform::errors::Unimplemented( - "The input porder of p_norm is not support for " - "porder == 0, INFINITY, -INFINITY now.")); - PADDLE_ENFORCE_NE(porder, -INFINITY, - platform::errors::Unimplemented( - "The input porder of p_norm is not support for " - "porder == 0, INFINITY, -INFINITY now.")); - PADDLE_ENFORCE_GT(porder, 0.0f, - platform::errors::InvalidArgument( - "The input porder of p_norm is not support for " - "porder <= 0, But received porder=%f.", - porder)); - auto xdim = ctx->GetInputDim("X"); + auto x_dim = ctx->GetInputDim("X"); + auto x_rank = x_dim.size(); int axis = ctx->Attrs().Get("axis"); bool keepdim = ctx->Attrs().Get("keepdim"); - if (axis < 0) axis = xdim.size() + axis; + + PADDLE_ENFORCE_GE(axis, -x_rank, + platform::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], R is " + "the rank of Input(X). But received axis: %d, R: %d. " + "Current Input(X)'s shape is=[%s].", + axis, x_rank, x_dim)); + PADDLE_ENFORCE_LT(axis, x_rank, + platform::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], R is " + "the rank of Input(X). But received axis: %d, R: %d. 
" + "Current Input(X)'s shape is=[%s].", + axis, x_rank, x_dim)); + std::vector reduce_dims; - for (int i = 0; i < xdim.size(); ++i) { - if (i != axis) reduce_dims.emplace_back(xdim[i]); + bool asvector = ctx->Attrs().Get("asvector"); + if (asvector) { + reduce_dims.emplace_back(1); + } else { + if (axis < 0) axis = x_dim.size() + axis; + for (int i = 0; i < x_dim.size(); ++i) { + if (i != axis) reduce_dims.emplace_back(x_dim[i]); + } } - xdim[axis] = 1; + x_dim[axis] = 1; + if (keepdim) { - ctx->SetOutputDim("Out", xdim); + ctx->SetOutputDim("Out", x_dim); } else { ctx->SetOutputDim("Out", framework::make_ddim(reduce_dims)); } diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index d9ac98ff880bcf42e0af5bb75b080464c5211671..ba0d46f4c73ec2683e51722033713c5cb3736643 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -49,20 +49,70 @@ __global__ void Pnorm(const T* x, const int pre, for (int i = blockIdx.x; i < num; i += gridDim.x) { int base = (i / post) * post * axis_n + (i % post); - T sum = 0.0; - __shared__ T norm; for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { const T x_ij = x[base + j * post]; sum += inline_pow(inline_abs(x_ij), porder_t); } T reduce_result = BlockReduce(temp_storage).Sum(sum); + if (threadIdx.x == 0) out_norm[i] = inline_pow(reduce_result, porder_inv); + } +} - if (threadIdx.x == 0) { - norm = inline_pow(reduce_result, porder_inv); - out_norm[i] = norm; +template +__global__ void ZeorNorm(const T* x, const int pre, + const int axis_n, // dim in axis + const int post, T* out_norm) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + int base = (i / post) * post * axis_n + (i % post); + T sum = 0.0; + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + const T x_ij = x[base + j * post]; + sum += static_cast(x_ij != 0); } - __syncthreads(); + T reduce_result = BlockReduce(temp_storage).Sum(sum); + if (threadIdx.x == 0) out_norm[i] = reduce_result; + } +} + +template +__global__ void InfNorm(const T* x, const int pre, + const int axis_n, // dim in axis + const int post, T* out_norm) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + int base = (i / post) * post * axis_n + (i % post); + T cur_max = inline_abs(x[base]); + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + T x_ij_abs = inline_abs(x[base + j * post]); + if (cur_max < x_ij_abs) cur_max = x_ij_abs; + } + T reduce_result = BlockReduce(temp_storage).Reduce(cur_max, cub::Max()); + if (threadIdx.x == 0) out_norm[i] = reduce_result; + } +} + +template +__global__ void NegInfNorm(const T* x, const int pre, + const int axis_n, // dim in axis + const int post, T* out_norm) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + int base = (i / post) * post * axis_n + (i % post); + T cur_min = inline_abs(x[base]); + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + T x_ij_abs = inline_abs(x[base + j * post]); + if (cur_min > x_ij_abs) cur_min = x_ij_abs; + } + T reduce_result = BlockReduce(temp_storage).Reduce(cur_min, cub::Min()); + if (threadIdx.x == 0) out_norm[i] = reduce_result; } } @@ -79,9 +129,10 @@ class 
PnormCUDAKernel : public framework::OpKernel { auto ndim = out_norm->dims(); float porder = ctx.Attr("porder"); int axis = ctx.Attr("axis"); + bool asvector = ctx.Attr("asvector"); if (axis < 0) axis = xdim.size() + axis; int pre, n, post; - GetDims(xdim, axis, &pre, &n, &post); + GetDims(xdim, axis, &pre, &n, &post, asvector); auto& dev_ctx = ctx.cuda_device_context(); @@ -89,8 +140,19 @@ class PnormCUDAKernel : public framework::OpKernel { int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(max_blocks, pre * post); - Pnorm<<>>(x, pre, n, post, - porder, norm); + if (porder == 0) { + ZeorNorm<<>>(x, pre, n, post, + norm); + } else if (porder == INFINITY) { + InfNorm<<>>(x, pre, n, post, + norm); + } else if (porder == -INFINITY) { + NegInfNorm<<>>(x, pre, n, + post, norm); + } else { + Pnorm<<>>(x, pre, n, post, + porder, norm); + } } }; @@ -112,7 +174,6 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad, pnorm_i = x_norm[i]; yout_i = y_grad[i]; } - __syncthreads(); for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { @@ -125,6 +186,33 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad, } } +template +__global__ void InfNormGradient(const T* x, const T* x_norm, const T* y_grad, + const int pre, const int axis_n, const int post, + T* x_grad) { + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + __shared__ T pnorm_i; + __shared__ T yout_i; + auto base = (i / post) * post * axis_n + (i % post); + if (threadIdx.x == 0) { + pnorm_i = x_norm[i]; + yout_i = y_grad[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + int index = base + j * post; + const T x_ij = inline_abs(x[index]); + if (x_ij == pnorm_i) { + x_grad[index] = inline_sign(x[index]) * yout_i; + } else { + x_grad[index] = static_cast(0); + } + } + } +} + template class PnormGradCUDAKernel : public framework::OpKernel { public: @@ -143,9 +231,10 @@ class PnormGradCUDAKernel : public framework::OpKernel { float porder = ctx.Attr("porder"); T eps = static_cast(ctx.Attr("epsilon")); int axis = ctx.Attr("axis"); + bool asvector = ctx.Attr("asvector"); if (axis < 0) axis = xdim.size() + axis; int pre, n, post; - GetDims(xdim, axis, &pre, &n, &post); + GetDims(xdim, axis, &pre, &n, &post, asvector); auto& dev_ctx = ctx.cuda_device_context(); @@ -153,8 +242,17 @@ class PnormGradCUDAKernel : public framework::OpKernel { int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(max_blocks, pre * post); - PnormGradient<<>>( - x, x_norm, norm_dy, porder, pre, n, post, eps, dx); + if (porder == 0) { + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, out_dx, static_cast(0)); + } else if (porder == INFINITY || porder == -INFINITY) { + InfNormGradient<<>>( + x, x_norm, norm_dy, pre, n, post, dx); + } else { + PnormGradient<<>>( + x, x_norm, norm_dy, porder, pre, n, post, eps, dx); + } } }; diff --git a/paddle/fluid/operators/p_norm_op.h b/paddle/fluid/operators/p_norm_op.h index c5bdfe352723b55f80376d6644922af5de099e90..8fca6924a2541d052bb2ebce0225ba5522ff6fd5 100644 --- a/paddle/fluid/operators/p_norm_op.h +++ b/paddle/fluid/operators/p_norm_op.h @@ -20,15 +20,19 @@ namespace paddle { namespace operators { inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, - int* post) { + int* post, bool asvector) { 
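  // GetDims splits the input shape around `axis` for the norm reduction:
  //   pre  = product of dims before `axis`,
  //   n    = dim[axis] (the length being reduced),
  //   post = product of dims after `axis`.
  // e.g. dim = [2, 3, 4], axis = 1  ->  pre = 2, n = 3, post = 4.
  // With asvector == true the tensor is treated as one flat vector:
  // n becomes product(dim) while pre and post stay at 1.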
*pre = 1; *post = 1; *n = dim[axis]; - for (int i = 0; i < axis; ++i) { - (*pre) *= dim[i]; - } - for (int i = axis + 1; i < dim.size(); ++i) { - (*post) *= dim[i]; + if (asvector) { + *n = product(dim); + } else { + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } } } @@ -43,9 +47,10 @@ class PnormKernel : public framework::OpKernel { auto xdim = in_x->dims(); float porder = ctx.Attr("porder"); int axis = ctx.Attr("axis"); + bool asvector = ctx.Attr("asvector"); if (axis < 0) axis = xdim.size() + axis; int pre, n, post; - GetDims(xdim, axis, &pre, &n, &post); + GetDims(xdim, axis, &pre, &n, &post, asvector); auto* place = ctx.template device_context().eigen_device(); @@ -58,10 +63,20 @@ class PnormKernel : public framework::OpKernel { auto x = x_e.reshape(shape); auto norm = norm_e.reshape(norm_shape); + // p=0 means number of non-zero elements of (x) + // p=inf means the maximum of |x| + // p=-inf means the minimum of |x| + // otherwise, Lp-norm = pow(sum(pow(|x|, p)), 1/p) Eigen::DSizes rdim(1); - auto xp = (x.abs()).pow(porder); - auto sum = xp.sum(rdim); - norm.device(*place) = sum.pow(1.0f / porder); + if (porder == 0) { + norm.device(*place) = (x != x.constant(0)).template cast().sum(rdim); + } else if (porder == INFINITY) { + norm.device(*place) = x.abs().maximum(rdim); + } else if (porder == -INFINITY) { + norm.device(*place) = x.abs().minimum(rdim); + } else { + norm.device(*place) = x.abs().pow(porder).sum(rdim).pow(1.0f / porder); + } } }; @@ -81,9 +96,10 @@ class PnormGradKernel : public framework::OpKernel { float porder = ctx.Attr("porder"); int axis = ctx.Attr("axis"); + bool asvector = ctx.Attr("asvector"); if (axis < 0) axis = xdim.size() + axis; int pre, n, post; - GetDims(xdim, axis, &pre, &n, &post); + GetDims(xdim, axis, &pre, &n, &post, asvector); Eigen::DSizes shape(pre, n, post); Eigen::DSizes rshape(pre, 1, post); @@ -102,10 +118,20 @@ class PnormGradKernel : public framework::OpKernel { Eigen::DSizes rdim(1); Eigen::DSizes bcast(1, n, 1); - dx.device(*place) = (x.abs()).pow(porder - 1.0f); - dx.device(*place) = - dx / ((norm.broadcast(bcast)).pow(porder - 1.0f) + x.constant(eps)); - dx.device(*place) = dx * norm_dy.broadcast(bcast) * x.sign(); + if (porder == 0) { + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, out_dx, static_cast(0)); + } else if (porder == INFINITY || porder == -INFINITY) { + dx.device(*place) = + (x.abs() == norm.broadcast(bcast)).template cast() * x.sign() * + norm_dy.broadcast(bcast); + } else { + dx.device(*place) = + (x.abs()).pow(porder - 1.0f) / + ((norm.broadcast(bcast)).pow(porder - 1.0f) + x.constant(eps)); + dx.device(*place) = dx * norm_dy.broadcast(bcast) * x.sign(); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d41b823b6551647803ae5641f72955dbbc1eb62 --- /dev/null +++ b/paddle/fluid/operators/pad3d_op.cc @@ -0,0 +1,912 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
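A quick numeric check of the special cases the p_norm kernels above now cover: for x = [1, 0, -2], the zero-"norm" counts non-zero entries, so ||x||_0 = 2; the inf-norm is the largest absolute value, ||x||_inf = 2; the negative-inf-norm is the smallest absolute value, ||x||_{-inf} = 0; and the usual 2-norm is (1 + 0 + 4)^{1/2} = sqrt(5) ≈ 2.236. The gradient paths match: porder == 0 zeroes the input gradient, while the ±inf branches forward the incoming gradient only to elements whose |x_i| equals the norm, multiplied by that element's sign.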
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, const int out_d, + const int out_h, const int out_w, const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? value + : in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, const int out_h, + const int out_w, const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) { + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = value; + } + } else { + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } +} + +template +void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const int out_d, const int out_h, const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); // reflect by 0 + in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, const int out_h, + const int out_w, const T value) { + int in_d = out_d - 
pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); + in_d = std::min(in_d, 2 * in_depth - in_d - 2); + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const int out_d, const int out_h, const int out_w, + const T value) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, + const int out_h, const int out_w, const T value) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const int out_d, const int out_h, const int out_w, + const T value) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, + const int out_h, const int out_w, const T value) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * 
in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void Pad3DNCDHW(const T* in_data, const int num, const int channels, + const int in_depth, const int in_height, const int in_width, + const int out_depth, const int out_height, const int out_width, + const int pad_front, const int pad_top, const int pad_left, + T value, T* out_data, + void (*pad_func)(const T*, T*, const int, const int, const int, + const int, const int, const int, const int, + const int, const int, const int, const int, + const int, const T)) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(in_data, out_data, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, + pad_left, out_d, out_h, out_w, value); + } + } + } + in_data += in_depth * in_height * in_width; + out_data += out_depth * out_height * out_width; + } + } +} + +template +void Pad3DNDHWC(const T* in_data, const int num, const int channels, + const int in_depth, const int in_height, const int in_width, + const int out_depth, const int out_height, const int out_width, + const int pad_front, const int pad_top, const int pad_left, + T value, T* out_data, + void (*pad_func)(const T*, T*, const int, const int, const int, + const int, const int, const int, const int, + const int, const int, const int, const int, + const int, const int, const T)) { + for (int n = 0; n < num; ++n) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(in_data, out_data, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, + pad_left, out_d, out_h, out_w, value); + } + } + } + in_data += in_depth * in_height * in_width * channels; + out_data += out_depth * out_height * out_width * channels; + } +} + +template +void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, const int out_d, + const int out_h, const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width)) { + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] = + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; + } +} + +template +void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width)) { + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < 
channels; ++c) { + d_in_data[in_index + c] = d_out_data[out_index + c]; + } + } +} + +template +void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); // reflect by 0 + in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data, + const int channels, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const int out_d, const int out_h, const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); + in_d = std::min(in_d, 2 * in_depth - in_d - 2); + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, + const int out_h, const int out_w) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data, + const int channels, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const int out_d, const int out_h, + const int out_w) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void 
CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const int out_d, + const int out_h, const int out_w) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data, + const int channels, const int in_depth, + const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const int out_d, const int out_h, const int out_w) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels, + const int in_depth, const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, const int pad_top, + const int pad_left, const T* d_out_data, + void (*pad_func)(T*, const T*, const int, const int, + const int, const int, const int, const int, + const int, const int, const int, const int, + const int, const int)) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(d_in_data, d_out_data, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, + pad_left, out_d, out_h, out_w); + } + } + } + d_in_data += in_depth * in_height * in_width; + d_out_data += out_depth * out_height * out_width; + } + } +} + +template +void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels, + const int in_depth, const int in_height, const int in_width, + const int out_depth, const int out_height, + const int out_width, const int pad_front, const int pad_top, + const int pad_left, const T* d_out_data, + void (*pad_func)(T*, const T*, const int, const int, + const int, const int, const int, const int, + const int, const int, const int, const int, + const int, const int, const int)) { + for (int n = 0; n < num; ++n) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(d_in_data, d_out_data, channels, in_depth, in_height, + in_width, out_depth, out_height, out_width, pad_front, + pad_top, pad_left, out_d, out_h, out_w); + } + } + } + d_in_data += in_depth * in_height * in_width * channels; + d_out_data += out_depth * out_height * out_width * channels; + } +} + +static inline std::vector GetPaddings( + const framework::ExecutionContext& context) { 
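  // Both the optional Paddings input tensor and the `paddings` attribute hold
  // six values ordered [left, right, top, bottom, front, back]; the kernels
  // below read pad_left = pads[0], pad_top = pads[2], pad_front = pads[4].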
+ std::vector paddings(6); + auto* paddings_t = context.Input("Paddings"); + if (paddings_t) { + auto paddings_data = paddings_t->data(); + std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int)); + } else { + auto pads = context.Attr>("paddings"); + std::copy(pads.begin(), pads.end(), paddings.data()); + } + return paddings; +} + +template +class Pad3dCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + std::vector pads = GetPaddings(context); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + T value = static_cast(context.Attr("value")); + + auto* x = context.Input("X"); + auto in_dims = x->dims(); + const T* in_data = x->data(); + + auto* out = context.Output("Out"); + if (data_format == "NCDHW") { + out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5], + in_dims[3] + pads[2] + pads[3], + in_dims[4] + pads[0] + pads[1]}); + } else { + out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5], + in_dims[2] + pads[2] + pads[3], + in_dims[3] + pads[0] + pads[1], in_dims[4]}); + } + auto out_dims = out->dims(); + T* out_data = out->mutable_data(context.GetPlace()); + + int channels = in_dims[1]; + int in_depth = in_dims[2]; + int in_height = in_dims[3]; + int in_width = in_dims[4]; + int out_depth = out_dims[2]; + int out_height = out_dims[3]; + int out_width = out_dims[4]; + if (data_format == "NDHWC") { + channels = in_dims[4]; + in_depth = in_dims[1]; + in_height = in_dims[2]; + in_width = in_dims[3]; + out_depth = out_dims[1]; + out_height = out_dims[2]; + out_width = out_dims[3]; + } + + if (mode == "reflect") { + PADDLE_ENFORCE_GT(in_depth, pads[4], + platform::errors::InvalidArgument( + "The depth of Input(X)'s dimension should be " + "greater than pad_front" + " in reflect mode" + ", but received depth(%d) and pad_front(%d).", + in_depth, pads[4])); + PADDLE_ENFORCE_GT(in_depth, pads[5], + platform::errors::InvalidArgument( + "The depth of Input(X)'s dimension should be " + "greater than pad_back" + " in reflect mode" + ", but received depth(%d) and pad_back(%d).", + in_depth, pads[5])); + + PADDLE_ENFORCE_GT(in_height, pads[2], + platform::errors::InvalidArgument( + "The height of Input(X)'s dimension should be " + "greater than pad_top" + " in reflect mode" + ", but received depth(%d) and pad_top(%d).", + in_height, pads[2])); + PADDLE_ENFORCE_GT(in_height, pads[3], + platform::errors::InvalidArgument( + "The height of Input(X)'s dimension should be " + "greater than pad_bottom" + " in reflect mode" + ", but received depth(%d) and pad_bottom(%d).", + in_height, pads[3])); + + PADDLE_ENFORCE_GT(in_width, pads[0], + platform::errors::InvalidArgument( + "The width of Input(X)'s dimension should be " + "greater than pad_left" + " in reflect mode" + ", but received depth(%d) and pad_left(%d).", + in_width, pads[0])); + PADDLE_ENFORCE_GT(in_width, pads[1], + platform::errors::InvalidArgument( + "The width of Input(X)'s dimension should be " + "greater than pad_right" + " in reflect mode" + ", but received depth(%d) and pad_right(%d).", + in_width, pads[1])); + } + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = in_dims[0]; + if (data_format == "NCDHW") { + std::map + func_map; + + func_map["reflect"] = ReflectPad3DFuncNCDHW; + func_map["replicate"] = ReplicatePad3DFuncNCDHW; + func_map["circular"] = CircularPad3DFuncNCDHW; + func_map["constant"] = ConstPad3DFuncNCDHW; + 
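      // The mode string is resolved once to a per-element pad functor via
      // func_map; Pad3DNCDHW below then applies that functor at every
      // (out_d, out_h, out_w) position for each sample and channel.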
Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + value, out_data, func_map[mode]); + } else { + std::map + func_map; + + func_map["reflect"] = ReflectPad3DFuncNDHWC; + func_map["replicate"] = ReplicatePad3DFuncNDHWC; + func_map["circular"] = CircularPad3DFuncNDHWC; + func_map["constant"] = ConstPad3DFuncNDHWC; + Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + value, out_data, func_map[mode]); + } + } +}; + +template +class Pad3dGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + std::vector pads = GetPaddings(context); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = d_in->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + set_zero(context.template device_context(), + d_in, static_cast(0)); + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = d_in_dims[0]; + if (data_format == "NCDHW") { + const int channels = d_in_dims[1]; + const int in_depth = d_in_dims[2]; + const int in_height = d_in_dims[3]; + const int in_width = d_in_dims[4]; + const int out_depth = d_out_dims[2]; + const int out_height = d_out_dims[3]; + const int out_width = d_out_dims[4]; + + std::map + func_map; + + func_map["reflect"] = ReflectPad3DGradNCDHW; + func_map["replicate"] = ReplicatePad3DGradNCDHW; + func_map["circular"] = CircularPad3DGradNCDHW; + func_map["constant"] = ConstPad3DGradNCDHW; + + Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, + pad_left, d_out_data, func_map[mode]); + } else { + const int channels = d_in_dims[4]; + const int in_depth = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_depth = d_out_dims[1]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + + std::map + func_map; + + func_map["reflect"] = ReflectPad3DGradNDHWC; + func_map["replicate"] = ReplicatePad3DGradNDHWC; + func_map["circular"] = CircularPad3DGradNDHWC; + func_map["constant"] = ConstPad3DGradNDHWC; + + Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, + pad_left, d_out_data, func_map[mode]); + } + } +}; + +class Pad3dOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d"); + + auto x_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dim.size(), 5, + platform::errors::InvalidArgument( + "The size of Input(X)'s dimension should be equal to " + "5, but received %d. 
", + x_dim.size())); + + std::vector out_dims(x_dim.size()); + auto data_format = ctx->Attrs().Get("data_format"); + out_dims[0] = x_dim[0]; + if (ctx->HasInput("Paddings")) { + auto paddings_dim = ctx->GetInputDim("Paddings"); + PADDLE_ENFORCE_EQ(paddings_dim.size(), 1, + platform::errors::InvalidArgument( + "Size of Input(Paddings)'s dimension should be " + "equal to 1, but received %d.", + paddings_dim.size())); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(paddings_dim[0], 6, + platform::errors::InvalidArgument( + "Shape of Input(Paddings) should be equal to " + "[6], but received [%d].", + paddings_dim[0])); + } + out_dims[1] = x_dim[1]; + out_dims[2] = x_dim[2]; + out_dims[3] = x_dim[3]; + } else { + auto paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE_EQ( + paddings.size(), 6, + platform::errors::InvalidArgument( + "Size of paddings should be equal to 4, but received %d.", + static_cast(paddings.size()))); + if (data_format == "NCDHW") { + out_dims[1] = x_dim[1]; // channel + out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) + ? x_dim[2] + : (x_dim[2] + paddings[4] + paddings[5]); // depth + + out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) + ? x_dim[3] + : (x_dim[3] + paddings[2] + paddings[3]); // height + + out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0)) + ? x_dim[4] + : (x_dim[4] + paddings[0] + paddings[1]); // width + } else { // NDHWC + out_dims[4] = x_dim[4]; // channel + + out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0)) + ? x_dim[1] + : (x_dim[1] + paddings[4] + paddings[5]); // depth + out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) + ? x_dim[2] + : (x_dim[2] + paddings[2] + paddings[3]); // height + out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) + ? x_dim[3] + : (x_dim[3] + paddings[0] + paddings[1]); // width + } + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class Pad3dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad3d op. " + "The input should be a 5-D tensor with formate NCDHW or NDHWC."); + AddOutput("Out", + "The output of pad3d op. " + "A tensor with the same shape as X."); + AddInput("Paddings", + "A 1-D tensor to describe the padding rules." + "paddings=[0, 1, 2, 3, 4, 5] means " + "padding 0 column to left, 1 column to right, " + "2 row to top, 3 row to bottom, 4 depth to front " + "and 5 depth to back. Size of paddings must be 6.") + .AsDispensable(); + AddAttr>( + "paddings", + "(vector) " + "A list to describe the padding rules." + "paddings=[0, 1, 2, 3, 4, 5] means " + "padding 0 column to left, 1 column to right, " + "2 row to top, 3 row to bottom, 4 depth to front " + "and 5 depth to back. Size of paddings must be 6."); + AddAttr("value", + "(float, default 0.0) " + "The value to fill the padded areas in constant mode.") + .SetDefault(0.0f); + AddAttr( + "mode", + "(string, default constant) " + "Four modes: constant(default), reflect, replicate, circular.") + .SetDefault("constant"); + AddAttr( + "data_format", + "(string, default NCDHW) Only used in " + "An optional string from: \"NDHWC\", \"NCDHW\". " + "Defaults to \"NDHWC\". Specify the data format of the input data.") + .SetDefault("NCDHW"); + AddComment(R"DOC( +Pad3d Operator. 
+Pad 3-d images according to 'paddings' and 'mode'. +If mode is 'reflect', paddings[0] and paddings[1] must be no greater +than width-1. The height and depth dimension have the same condition. + +Given that X is a channel of image from input: + +X = [[[[[1, 2, 3], + [4, 5, 6]]]]] + +Case 0: + +paddings = [2, 2, 1, 1, 0, 0], +mode = 'constant' +pad_value = 0 + +Out = [[[[[0. 0. 0. 0. 0. 0. 0.] + [0. 0. 1. 2. 3. 0. 0.] + [0. 0. 4. 5. 6. 0. 0.] + [0. 0. 0. 0. 0. 0. 0.]]]]] + +Case 1: + +paddings = [2, 2, 1, 1, 0, 0], +mode = 'reflect' + +Out = [[[[[6. 5. 4. 5. 6. 5. 4.] + [3. 2. 1. 2. 3. 2. 1.] + [6. 5. 4. 5. 6. 5. 4.] + [3. 2. 1. 2. 3. 2. 1.]]]]] + +Case 2: + +paddings = [2, 2, 1, 1, 0, 0], +mode = 'replicate' + +Out = [[[[[1. 1. 1. 2. 3. 3. 3.] + [1. 1. 1. 2. 3. 3. 3.] + [4. 4. 4. 5. 6. 6. 6.] + [4. 4. 4. 5. 6. 6. 6.]]]]] + +Case 3: + +paddings = [2, 2, 1, 1, 0, 0], +mode = 'circular' + +Out = [[[[[5. 6. 4. 5. 6. 4. 5.] + [2. 3. 1. 2. 3. 1. 2.] + [5. 6. 4. 5. 6. 4. 5.] + [2. 3. 1. 2. 3. 1. 2.]]]]] + +)DOC"); + } +}; + +class Pad3dOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d@Grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "Pad3d@Grad"); + + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class Pad3dOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr bind) const override { + bind->SetInput("X", this->Input("X")); + if (this->HasInput("Paddings")) { + bind->SetInput("Paddings", this->Input("Paddings")); + } + bind->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + bind->SetAttrMap(this->Attrs()); + bind->SetType("pad3d_grad"); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker, + ops::Pad3dOpGradMaker, + ops::Pad3dOpGradMaker); +REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad, + ops::Pad3dOpGradNoNeedBufferVarsInferer); +REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel, + ops::Pad3dCPUKernel, ops::Pad3dCPUKernel, + ops::Pad3dCPUKernel); +REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel, + ops::Pad3dGradCPUKernel); diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..672a75389ccf18d11e508ca94d45128b2e7b56b7 --- /dev/null +++ b/paddle/fluid/operators/pad3d_op.cu @@ -0,0 +1,788 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
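To make the pad3d shape rule concrete: in NCDHW layout the output grows by front + back in depth, top + bottom in height, and left + right in width, exactly as in the InferShape code above. A small illustrative helper (not part of the patch) with a worked example:

#include <array>
#include <cstdint>

// Output shape for pad3d in NCDHW layout; paddings are ordered
// [left, right, top, bottom, front, back] as in the op documentation.
std::array<int64_t, 5> Pad3dOutShapeNCDHW(const std::array<int64_t, 5>& in,
                                          const std::array<int, 6>& pads) {
  return {in[0],                      // batch
          in[1],                      // channels
          in[2] + pads[4] + pads[5],  // depth  += front + back
          in[3] + pads[2] + pads[3],  // height += top + bottom
          in[4] + pads[0] + pads[1]}; // width  += left + right
}
// e.g. in = [2, 3, 4, 5, 6] with pads = [1, 1, 2, 2, 3, 3] gives
// [2, 3, 10, 9, 8]; for NDHWC the channel axis stays last and the same
// depth/height/width arithmetic applies.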
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +using framework::Tensor; + +template +__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T value, T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[index] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? value + : in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w]; + } +} + +template +__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T value, T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + const int in_d = out_d - pad_front; + const int in_h = out_h - pad_top; + const int in_w = out_w - pad_left; + + out_data[index] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? 
value + : in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); // reflect by 0 + in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = max(in_h, -in_h); // reflect by 0 + in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = max(in_w, -in_w); // reflect by 0 + in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_d = min(in_d, 2 * in_depth - in_d - 2); + in_h = max(in_h, -in_h); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = max(in_w, -in_w); + in_w = min(in_w, 2 * in_width - in_w - 2); + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T* out_data) { + 
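  // One CUDA thread per output element. The flat index decomposes as
  //   (((n * out_depth + out_d) * out_height + out_h) * out_width + out_w)
  //       * channels + c,
  // unpacked below with repeated modulo/divide steps. Replicate padding then
  // clamps each source coordinate into [0, extent - 1] rather than reading
  // outside the input.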
CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const T* d_out_data) { + CUDA_KERNEL_LOOP(in_index, in_size) { + const int in_w = in_index % in_width; + + int nc = in_index / in_width; + const int in_h = nc % in_height; + + nc /= in_height; + const int in_d = nc % in_depth; + + nc /= in_depth; + + const int out_d = in_d + pad_front; + const int out_h = in_h + pad_top; + const int out_w = in_w + pad_left; + d_in_data[in_index] = + d_out_data[nc * out_depth * out_height * out_width + + out_d * out_height * out_width + out_h * out_width + out_w]; + } +} + +template +__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int 
out_width, + const int pad_front, const int pad_top, + const int pad_left, const T* d_out_data) { + CUDA_KERNEL_LOOP(in_index, in_size) { + const int c = in_index % channels; + int n = in_index / channels; + + const int in_w = n % in_width; + n /= in_width; + + const int in_h = n % in_height; + n /= in_height; + + const int in_d = n % in_depth; + n /= in_depth; + + const int out_d = in_d + pad_front; + const int out_h = in_h + pad_top; + const int out_w = in_w + pad_left; + + d_in_data[in_index] = + d_out_data[n * out_depth * out_height * out_width * channels + + out_d * out_height * out_width * channels + + out_h * out_width * channels + out_w * channels + c]; + } +} + +template +__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + + in_d = min(in_d, 2 * in_depth - in_d - 2); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = min(in_w, 2 * in_width - in_w - 2); + + platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, const int out_width, + const int pad_front, const int pad_top, + const int pad_left, const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + + in_d = min(in_d, in_depth * 2 - in_d - 2); + in_h = min(in_h, in_height * 2 - in_h - 2); + in_w = min(in_w, in_width * 2 - in_w - 2); + platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReplicateNCDHW( + const int out_size, T* d_in_data, const int num, const int channels, + const int in_depth, const int in_height, const int in_width, + const int out_depth, const int out_height, const int out_width, + const int pad_front, const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + const int in_h = min(in_height 
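// Illustrative sketch (not part of the patch): why the reflect / replicate /
// circular backward kernels accumulate with platform::CudaAtomicAdd. Several
// output positions can map to the same input element, so the gradient is a
// scatter-add and plain stores would race. Hypothetical minimal kernel with a
// precomputed index map, one thread per output element:
template <typename T>
__global__ void ScatterAddGrad(const T* d_out, const int* in_index_of_out,
                               int out_size, T* d_in) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < out_size) {
    atomicAdd(&d_in[in_index_of_out[i]], d_out[i]);  // accumulate, never overwrite
  }
}
// The constant-mode backward, by contrast, is a pure gather (one thread per
// input element), which is why its launch in the grad Compute() re-sizes the
// grid by in_size and needs no atomics.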
- 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReplicateNDHWC( + const int out_size, T* d_in_data, const int num, const int channels, + const int in_depth, const int in_height, const int in_width, + const int out_depth, const int out_height, const int out_width, + const int pad_front, const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_depth, const int in_height, + const int in_width, const int out_depth, + const int out_height, + const int out_width, const int pad_front, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +static inline std::vector GetPaddings( + const framework::ExecutionContext& context) { + std::vector paddings(6); + auto* paddings_data = context.Input("Paddings"); + if (paddings_data) { + Tensor pads; + framework::TensorCopySync(*paddings_data, 
platform::CPUPlace(), &pads); + auto pads_data = pads.data(); + std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int)); + } else { + auto pads = context.Attr>("paddings"); + std::copy(pads.begin(), pads.end(), paddings.data()); + } + return paddings; +} + +template +class Pad3dCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + std::vector pads = GetPaddings(context); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + T value = static_cast(context.Attr("value")); + + auto* x = context.Input("X"); + auto in_dims = x->dims(); + const T* in_data = x->data(); + auto* out = context.Output("Out"); + auto out_dims = out->dims(); + if (data_format == "NCDHW") { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1]; + out_dims[2] = in_dims[2] + pads[4] + pads[5]; + out_dims[3] = in_dims[3] + pads[2] + pads[3]; + out_dims[4] = in_dims[4] + pads[0] + pads[1]; + } else { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1] + pads[4] + pads[5]; + out_dims[2] = in_dims[2] + pads[2] + pads[3]; + out_dims[3] = in_dims[3] + pads[0] + pads[1]; + out_dims[4] = in_dims[4]; + } + T* out_data = out->mutable_data(out_dims, context.GetPlace()); + + int channels = in_dims[1]; + int in_depth = in_dims[2]; + int in_height = in_dims[3]; + int in_width = in_dims[4]; + int out_depth = out_dims[2]; + int out_height = out_dims[3]; + int out_width = out_dims[4]; + if (data_format == "NDHWC") { + channels = in_dims[4]; + in_depth = in_dims[1]; + in_height = in_dims[2]; + in_width = in_dims[3]; + out_depth = out_dims[1]; + out_height = out_dims[2]; + out_width = out_dims[3]; + } + + if (mode == "reflect") { + PADDLE_ENFORCE_GT(in_depth, pads[4], + platform::errors::InvalidArgument( + "The depth of Input(X)'s dimension should be " + "greater than pad_front" + " in reflect mode" + ", but received depth(%d) and pad_front(%d).", + in_depth, pads[4])); + PADDLE_ENFORCE_GT(in_depth, pads[5], + platform::errors::InvalidArgument( + "The depth of Input(X)'s dimension should be " + "greater than pad_back" + " in reflect mode" + ", but received depth(%d) and pad_back(%d).", + in_depth, pads[5])); + + PADDLE_ENFORCE_GT(in_height, pads[2], + platform::errors::InvalidArgument( + "The height of Input(X)'s dimension should be " + "greater than pad_top" + " in reflect mode" + ", but received depth(%d) and pad_top(%d).", + in_height, pads[2])); + PADDLE_ENFORCE_GT(in_height, pads[3], + platform::errors::InvalidArgument( + "The height of Input(X)'s dimension should be " + "greater than pad_bottom" + " in reflect mode" + ", but received depth(%d) and pad_bottom(%d).", + in_height, pads[3])); + + PADDLE_ENFORCE_GT(in_width, pads[0], + platform::errors::InvalidArgument( + "The width of Input(X)'s dimension should be " + "greater than pad_left" + " in reflect mode" + ", but received depth(%d) and pad_left(%d).", + in_width, pads[0])); + PADDLE_ENFORCE_GT(in_width, pads[1], + platform::errors::InvalidArgument( + "The width of Input(X)'s dimension should be " + "greater than pad_right" + " in reflect mode" + ", but received depth(%d) and pad_right(%d).", + in_width, pads[1])); + } + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = in_dims[0]; + + auto stream = context.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = out->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCDHW") { + 
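// Illustrative sketch (not part of the patch): how the six paddings map to the
// output shape computed above. The paddings vector is ordered
// {left, right, top, bottom, front, back}, i.e. the innermost axis (W) first:
//   NCDHW: out = [N, C, D + front + back, H + top + bottom, W + left + right]
//   NDHWC: out = [N, D + front + back, H + top + bottom, W + left + right, C]
inline void Pad3dOutDimsNCDHW(const int64_t in[5], const int pads[6],
                              int64_t out[5]) {
  out[0] = in[0];                      // N
  out[1] = in[1];                      // C
  out[2] = in[2] + pads[4] + pads[5];  // D
  out[3] = in[3] + pads[2] + pads[3];  // H
  out[4] = in[4] + pads[0] + pads[1];  // W
}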
if (mode == "reflect") { + Pad3DReflectNCDHW<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + out_data); + } else if (mode == "replicate") { + Pad3DReplicateNCDHW<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + out_data); + } else if (mode == "circular") { + Pad3DCircularNCDHW<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + out_data); + } else { + Pad3DConstNCDHW<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + value, out_data); + } + } else { + if (mode == "reflect") { + Pad3DReflectNDHWC<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + out_data); + } else if (mode == "replicate") { + Pad3DReplicateNDHWC<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + out_data); + } else if (mode == "circular") { + Pad3DCircularNDHWC<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + out_data); + } else { + Pad3DConstNDHWC<<>>( + out_size, in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + value, out_data); + } + } + } +}; + +template +class Pad3dGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + std::vector pads = GetPaddings(context); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = d_in->mutable_data(context.GetPlace()); + + math::SetConstant set_zero; + set_zero(context.template device_context(), + d_in, static_cast(0)); + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + + const int num = d_in_dims[0]; + + auto stream = context.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = d_out->numel(); + const int in_size = d_in->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCDHW") { + const int channels = d_in_dims[1]; + const int in_depth = d_in_dims[2]; + const int in_height = d_in_dims[3]; + const int in_width = d_in_dims[4]; + const int out_depth = d_out_dims[2]; + const int out_height = d_out_dims[3]; + const int out_width = d_out_dims[4]; + + if (mode == "reflect") { + Pad3DGradReflectNCDHW<<>>( + out_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + d_out_data); + } else if (mode == "replicate") { + Pad3DGradReplicateNCDHW<<>>( + out_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + d_out_data); + } else if (mode == "circular") { + Pad3DGradCircularNCDHW<<>>( + out_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, 
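// Illustrative note (not part of the patch): a representative launch for the
// dispatch above and below. Each kernel runs one thread per output element on
// the op's CUDA stream:
//
//   Pad3DReflectNCDHW<T><<<grid, block, 0, stream>>>(
//       out_size, in_data, num, channels, in_depth, in_height, in_width,
//       out_depth, out_height, out_width, pad_front, pad_top, pad_left,
//       out_data);
//
// with block = PADDLE_CUDA_NUM_THREADS and grid = (out_size + block - 1) / block,
// as computed in the surrounding Compute() methods.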
pad_front, pad_top, pad_left, + d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad3DGradConstNCDHW<<>>( + in_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + d_out_data); + } + } else { + const int channels = d_in_dims[4]; + const int in_depth = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_depth = d_out_dims[1]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + if (mode == "reflect") { + Pad3DGradReflectNDHWC<<>>( + out_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + d_out_data); + } else if (mode == "replicate") { + Pad3DGradReplicateNDHWC<<>>( + out_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + d_out_data); + } else if (mode == "circular") { + Pad3DGradCircularNDHWC<<>>( + out_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad3DGradConstNDHWC<<>>( + in_size, d_in_data, num, channels, in_depth, in_height, in_width, + out_depth, out_height, out_width, pad_front, pad_top, pad_left, + d_out_data); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel, + ops::Pad3dCUDAKernel, + ops::Pad3dCUDAKernel, ops::Pad3dCUDAKernel, + ops::Pad3dCUDAKernel); +REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel, + ops::Pad3dGradCUDAKernel, + ops::Pad3dGradCUDAKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 1ed7988dcfcc0831156c09a72e958852f3d45fb5..70d232ad6a51e21b863974e70920eb2d9da895e6 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -28,25 +28,44 @@ class PixelShuffleOp : public framework::OperatorWithKernel { "Output(Out) of PixelShuffleOp should not be null.")); auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W], but got %u.", - input_dims.size())); + PADDLE_ENFORCE_EQ(input_dims.size(), 4, + platform::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + input_dims.size())); auto upscale_factor = ctx->Attrs().Get("upscale_factor"); - PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), 0, - platform::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - input_dims[1], upscale_factor * upscale_factor)); - + const std::string data_format = + ctx->Attrs().Get("data_format"); + const bool channel_last = (data_format == "NHWC"); + + if (!channel_last) { + PADDLE_ENFORCE_EQ( + input_dims[1] % (upscale_factor * upscale_factor), 0, + platform::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + input_dims[1], upscale_factor * upscale_factor)); + } else { + PADDLE_ENFORCE_EQ( + input_dims[3] % (upscale_factor * upscale_factor), 0, + platform::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " 
+ "number of channel[%u]", + input_dims[3], upscale_factor * upscale_factor)); + } auto output_dims = input_dims; output_dims[0] = input_dims[0]; - output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] * upscale_factor; + if (!channel_last) { + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + } else { + output_dims[1] = input_dims[1] * upscale_factor; + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); + } ctx->SetOutputDim("Out", output_dims); } }; @@ -54,14 +73,14 @@ class PixelShuffleOp : public framework::OperatorWithKernel { class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput( - "X", - "(Tensor, default Tensor), " - "the input feature data of PixelShuffleOp, the layout is [N C H W]."); - AddOutput( - "Out", - "(Tensor, default Tensor), the output of " - "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor]."); + AddInput("X", + "(Tensor, default Tensor), " + "the input feature data of PixelShuffleOp, the layout is [N, C, " + "H, W] or [N, H, W, C]."); + AddOutput("Out", + "(Tensor, default Tensor), the output of " + "PixelShuffleOp. The layout is [N, C/factor^2, H*factor, " + "W*factor] or [N, H*factor, W*factor, C/factor^2]."); AddAttr("upscale_factor", "the factor to increase spatial resolution by.") .SetDefault(1) @@ -70,6 +89,11 @@ class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { platform::errors::InvalidArgument( "upscale_factor should be larger than 0.")); }); + AddAttr( + "data_format", + "An optional string from: \"NHWC\", \"NCHW\". 
" + "Defaults to \"NHWC\", Specify the data format of the input data.") + .SetDefault("NCHW"); AddComment(R"DOC( Pixel Shuffle operator @@ -114,19 +138,30 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { platform::errors::NotFound("Output(X@Grad) should not be null")); auto do_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - do_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W], but got %u.", - do_dims.size())); + PADDLE_ENFORCE_EQ(do_dims.size(), 4, + platform::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + do_dims.size())); auto upscale_factor = ctx->Attrs().Get("upscale_factor"); + const std::string data_format = + ctx->Attrs().Get("data_format"); + const bool channel_last = (data_format == "NHWC"); + auto dx_dims = do_dims; dx_dims[0] = do_dims[0]; - dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); - dx_dims[2] = do_dims[2] / upscale_factor; - dx_dims[3] = do_dims[3] / upscale_factor; + + if (!channel_last) { + dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] / upscale_factor; + } else { + dx_dims[1] = do_dims[1] / upscale_factor; + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] * (upscale_factor * upscale_factor); + } ctx->SetOutputDim(framework::GradVarName("X"), dx_dims); } }; diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h index 1ae1c7e9d50cb9d701fd0e79337a1906f2f5d545..b2a0db0f838d5dcc3fed2ed9838f1c43240ce0e7 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.h +++ b/paddle/fluid/operators/pixel_shuffle_op.h @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -24,23 +25,33 @@ class PixelShuffleOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); int factor = ctx.Attr("upscale_factor"); + std::string data_format = ctx.Attr("data_format"); + bool channel_last = (data_format == "NHWC"); + auto in_dims = in->dims(); auto o_dims = out->dims(); framework::Tensor t; t.ShareDataWith(*in); - t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); - + if (!channel_last) { + t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); + } else { + t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor}); + } std::vector axis = {0, 1, 4, 2, 5, 3}; framework::Tensor o; o.ShareDataWith(*out); - o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); - + if (!channel_last) { + o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); + } else { + o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); + } math::Transpose trans; auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, t, &o, axis); @@ -58,19 +69,32 @@ class PixelShuffleGradOpKernel : public framework::OpKernel { int factor = ctx.Attr("upscale_factor"); + std::string data_format = ctx.Attr("data_format"); + bool channel_last = (data_format == "NHWC"); + auto do_dims = dout->dims(); auto dx_dims = dx->dims(); framework::Tensor t; t.ShareDataWith(*dout); - t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); - + if (!channel_last) { + t.Resize( + {do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); + } else { + t.Resize( + {do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]}); + } std::vector axis = {0, 1, 3, 5, 2, 4}; framework::Tensor o; o.ShareDataWith(*dx); - o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); - + if (!channel_last) { + o.Resize( + {do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); + } else { + o.Resize( + {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); + } math::Transpose trans; auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, t, &o, axis); diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 354e5c60a6b9ed80f0f8c44439294bfa2731a423..7749903e5f36f1d93f7e111da4587d6828d445a4 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -111,7 +111,8 @@ static void CallPythonFunc(py::object *callable, out->set_lod(py_out_tensor->lod()); out->ShareDataWith(*py_out_tensor); } catch (py::cast_error &) { - PADDLE_THROW("The %d-th output must be LoDTensor", i); + PADDLE_THROW(platform::errors::InvalidArgument( + "The %d-th output must be LoDTensor.", i)); } } } diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc index 11ce738e001517a378769dd99e8feffd7b9f19be..b3a2e14331955f42761601cabe52311e40c28eaa 100644 --- a/paddle/fluid/operators/randint_op.cc +++ b/paddle/fluid/operators/randint_op.cc @@ -15,6 +15,7 @@ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/uniform_random_op.h" @@ -43,15 +44,14 @@ class 
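// Illustrative sketch (not part of the patch): a plain-CPU reference of what the
// NCHW reshape + Transpose in the pixel_shuffle kernel above computes. The
// kernel views the input as [N, C/r^2, r, r, H, W], permutes with axis
// {0, 1, 4, 2, 5, 3}, and reads the result as [N, C/r^2, H*r, W*r]; element-wise:
template <typename T>
void PixelShuffleReferenceNCHW(const T* in, T* out, int n, int c, int h, int w,
                               int r) {
  const int oc = c / (r * r);
  for (int b = 0; b < n; ++b)
    for (int k = 0; k < oc; ++k)
      for (int y = 0; y < h * r; ++y)
        for (int x = 0; x < w * r; ++x) {
          const int in_c = k * r * r + (y % r) * r + (x % r);
          const int in_idx = ((b * c + in_c) * h + y / r) * w + x / r;
          const int out_idx = ((b * oc + k) * h * r + y) * w * r + x;
          out[out_idx] = in[in_idx];
        }
}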
CPURandintKernel : public framework::OpKernel { T* data = out->mutable_data(ctx.GetPlace()); int64_t size = out->numel(); - unsigned int seed = static_cast(ctx.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); std::uniform_int_distribution dist(ctx.Attr("low"), ctx.Attr("high") - 1); - for (int64_t i = 0; i < size; ++i) data[i] = dist(engine); + unsigned int seed = static_cast(ctx.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } } }; diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h index 64ef1c771423f2d820c73df8ed9ff25834f07875..02aabb9a7b569c5aa8354f191c0de1497d9f9324 100644 --- a/paddle/fluid/operators/randperm_op.h +++ b/paddle/fluid/operators/randperm_op.h @@ -19,6 +19,8 @@ limitations under the License. */ #include #include #include + +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/place.h" @@ -28,14 +30,12 @@ namespace operators { template static inline void random_permate(T* data_ptr, int num, unsigned int seed) { + auto engine = framework::GetCPURandomEngine(seed); for (int i = 0; i < num; ++i) { data_ptr[i] = static_cast(i); } - if (seed == 0) { - seed = std::random_device()(); - } - std::srand(seed); - std::random_shuffle(data_ptr, data_ptr + num); + + std::shuffle(data_ptr, data_ptr + num, *engine); } template @@ -51,6 +51,7 @@ class RandpermKernel : public framework::OpKernel { if (platform::is_cpu_place(ctx.GetPlace())) { T* out_data = out_tensor->mutable_data(platform::CPUPlace()); random_permate(out_data, n, seed); + } else { framework::Tensor tmp_tensor; tmp_tensor.Resize(framework::make_ddim({n})); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..322a1637f5deec909db13f1bd0433446cd7606ae --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
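// Illustrative sketch (not part of the patch): the pattern the randint, randperm
// and sampling_id changes above switch to. Instead of seeding a local
// std::minstd_rand in every kernel, the ops obtain an engine from
// framework::GetCPURandomEngine(seed); as in the code it replaces, a seed of 0
// means "use a nondeterministic seed". A stand-alone analogue:
#include <cstdint>
#include <memory>
#include <random>

std::shared_ptr<std::mt19937_64> GetEngine(uint64_t seed) {
  if (seed == 0) {
    seed = std::random_device{}();  // nondeterministic fallback for seed == 0
  }
  return std::make_shared<std::mt19937_64>(seed);
}

void FillRandint(int64_t* data, int64_t size, int low, int high, uint64_t seed) {
  std::uniform_int_distribution<int64_t> dist(low, high - 1);  // [low, high)
  auto engine = GetEngine(seed);
  for (int64_t i = 0; i < size; ++i) data[i] = dist(*engine);
}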
+ +#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" +#include +#include +#include +#include + +namespace paddle { +namespace operators { + +class LogsumexpOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "logsumexp"; } + virtual std::string GetOpType() const { return "Reduce logsumexp"; } +}; + +template +class LogsumexpGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("logsumexp_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Out", this->Output("Out")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker, + ops::LogsumexpGradOpMaker, + ops::LogsumexpGradOpMaker); +REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp); + +REGISTER_OP_CPU_KERNEL(logsumexp, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + logsumexp_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c25e5d01b2758a96192d6fbf8f4e881770cbbbf0 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" + +REGISTER_OP_CUDA_KERNEL(logsumexp, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + logsumexp_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1d0e00262a37ff7160abd7a865e63377f8b30461 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -0,0 +1,58 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +struct LogsumexpFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + auto x_dim = x->dimensions(); + auto t_dim = x_dim; + for (int i = 0; i < static_cast(dim.size()); i++) { + t_dim[dim[i]] = 1; + } + + auto r_dim = x_dim; + for (int i = 0; i < static_cast(r_dim.size()); i++) { + r_dim[i] = 1; + } + for (int i = 0; i < static_cast(dim.size()); i++) { + r_dim[dim[i]] = x_dim[dim[i]]; + } + + auto y_dim = y->dimensions(); + auto x_max = x->maximum(dim); + y->device(place) = + (x_max + + (*x - x_max.reshape(t_dim).broadcast(r_dim)).exp().sum(dim).log()) + .reshape(y_dim); + } +}; + +struct LogsumexpGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) * (*x - y->broadcast(dim)).exp(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index fccf6d46895ff46c40d0a5c20d4cf1b614ad8a9e..fdb2c57385b2bc1068c618f206bfeb6513d3d8c4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -103,11 +103,7 @@ REGISTER_OP_CPU_KERNEL(reduce_mean, ops::ReduceKernel, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); + double, ops::MeanFunctor>); template using CPUReduceMeanGradKernel = @@ -115,6 +111,4 @@ using CPUReduceMeanGradKernel = ops::MeanGradFunctor, true>; REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel); + CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index 4d3bce8fdd05e536baa5fecb4fc5a117e2031224..cc3653fcb43a4c000d0c61c9d854965fafd59a9c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -66,6 +66,4 @@ class ReduceMeanKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel); + ops::ReduceMeanKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index 12eceb33ec27298d60713e72c9cc2cf91a5e7cfb..289f574719ff03b1b09f313d05bab152f5c5d651 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -21,6 +21,4 @@ using CUDAReduceMeanGradKernel = ops::MeanGradFunctor, true>; REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel); + CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index d70df5cd73847e5f63ce0b44b57dbb840d98b522..67a19cb83c36f9cb6ef0cdd65e9fc04a7bb4d169 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -236,8 +236,8 @@ class ReduceGradKernel : public framework::OpKernel { if (reduce_all) { auto x = EigenVector::Flatten(*input0); - auto x_reduce = EigenVector::From(*input1); - auto x_reduce_grad = 
EigenVector::From(*input2); + auto x_reduce = EigenVector::Flatten(*input1); + auto x_reduce_grad = EigenVector::Flatten(*input2); auto x_grad = EigenVector::Flatten(*output); auto& place = *context.template device_context().eigen_device(); @@ -334,6 +334,12 @@ class ReduceOp : public framework::OperatorWithKernel { "range [-dimension(X), dimension(X)] " "which dimesion = %d. But received dim index = %d.", i, x_rank, dims[i])); + PADDLE_ENFORCE_GE(dims[i], -x_rank, + platform::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimesion = %d. But received dim index = %d.", + i, x_rank, dims[i])); if (dims[i] < 0) dims[i] = x_rank + dims[i]; } sort(dims.begin(), dims.end()); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index c0fbc336e46b64fc6ee43ef1a7372e413c5c3213..1c493fc6be093a2af8f58c8e78d1be43de34306f 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -29,6 +29,11 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +DECLARE_bool(use_mkldnn); namespace paddle { namespace operators { @@ -262,6 +267,9 @@ class RunProgramOpKernel : public framework::OpKernel { } VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) DontClearMKLDNNCache(ctx.GetPlace()); +#endif } }; diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index 5ec32c98f7f84abb255ec996d0cf6a58e6312ec3..9bec08f593afebae736fcbb3eb42fc20992df779 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -21,6 +21,7 @@ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -50,18 +51,15 @@ class SamplingIdKernel : public framework::OpKernel { framework::TensorToVector(*input, context.device_context(), &ins_vector); unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); + std::uniform_real_distribution dist( static_cast(context.Attr("min")), static_cast(context.Attr("max"))); + auto engine = framework::GetCPURandomEngine(seed); std::vector ids(batch_size); for (int i = 0; i < batch_size; ++i) { - T r = dist(engine); + T r = dist(*engine); int idx = width - 1; for (int j = 0; j < width; ++j) { if ((r -= ins_vector[i * width + j]) < 0) { diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index 06eaca0216b36a50028fd7cfd3c0866a5b7c1de0..b45fa7c791ff22be422ce12a8348a071c60ddd0f 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -54,5 +54,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, + ops::SizeKernel, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel); diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu index 4e5846660e62543638b669d586a92fc36b0c8e87..3ea3032693236d5618ff6f0c858cbd85e34633ab 100644 --- a/paddle/fluid/operators/size_op.cu +++ b/paddle/fluid/operators/size_op.cu @@ -14,8 +14,9 @@ 
limitations under the License. */ #include "paddle/fluid/operators/size_op.h" -REGISTER_OP_CUDA_KERNEL(size, paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel); +REGISTER_OP_CUDA_KERNEL( + size, paddle::operators::SizeKernel, + paddle::operators::SizeKernel, + paddle::operators::SizeKernel, + paddle::operators::SizeKernel, paddle::operators::SizeKernel, + paddle::operators::SizeKernel); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 7528422fdc09b7894898bdee94eaa11ad2cba311..f20bada8ab288fe74fd8ca82a73522a22b234191 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index f416aa6e00f5a4a82c2562c36f9d32bb1a6843aa..cc2fe4cdbdb8faa69abad28fbdd31dc4e61bdc04 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -41,12 +41,12 @@ static void InitRandom(framework::Tensor *tensor, template struct LeakyReluGradGradEachElementFunctor { - LeakyReluGradGradEachElementFunctor(const T *ddx, const T *out, T alpha, + LeakyReluGradGradEachElementFunctor(const T *ddx, const T *x, T alpha, T *ddout) - : ddx_(ddx), out_(out), alpha_(alpha), ddout_(ddout) {} + : ddx_(ddx), x_(x), alpha_(alpha), ddout_(ddout) {} HOSTDEVICE void operator()(int idx) { - if (out_[idx] > 0) { + if (x_[idx] >= 0) { ddout_[idx] = ddx_[idx]; } else { ddout_[idx] = ddx_[idx] * alpha_; @@ -54,7 +54,7 @@ struct LeakyReluGradGradEachElementFunctor { } const T *ddx_; - const T *out_; + const T *x_; T alpha_; T *ddout_; }; @@ -66,13 +66,13 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, LeakyReluGradGradFunctor functor; functor.alpha = alpha; auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place); - framework::Tensor *x = nullptr; + framework::Tensor *out = nullptr; framework::Tensor *dout = nullptr; framework::Tensor *dx = nullptr; - framework::Tensor out; - out.Resize(dim); - InitRandom(&out, place); + framework::Tensor x; + x.Resize(dim); + InitRandom(&x, place); framework::Tensor ddx; ddx.Resize(dim); @@ -85,22 +85,22 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, framework::Tensor ddout_actual; ddout_actual.mutable_data(dim, place); LeakyReluGradGradEachElementFunctor actual_functor( - ddx.data(), out.data(), static_cast(alpha), + ddx.data(), x.data(), static_cast(alpha), ddout_actual.data()); - int64_t limit = out.numel(); + int64_t limit = x.numel(); #ifdef __NVCC__ if (platform::is_gpu_place(place)) { auto &cuda_dev_ctx = dynamic_cast(dev_ctx); - functor(cuda_dev_ctx, x, &out, &ddx, &ddout, dout, dx); + functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx); platform::ForRange for_range(cuda_dev_ctx, limit); for_range(actual_functor); } else { #endif auto 
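// Illustrative sketch (not part of the patch): the element-wise rule the fixed
// leaky_relu test functor above checks. The double-grad propagates ddx
// according to the sign of the *input* x, not of the forward output, which
// matters when alpha is negative; hence the change from `out_[idx] > 0` to
// `x_[idx] >= 0`:
template <typename T>
T LeakyReluDDout(T ddx, T x, T alpha) {
  return x >= static_cast<T>(0) ? ddx : ddx * alpha;
}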
&cpu_dev_ctx = dynamic_cast(dev_ctx); - functor(cpu_dev_ctx, x, &out, &ddx, &ddout, dout, dx); + functor(cpu_dev_ctx, &x, out, &ddx, &ddout, dout, dx); platform::ForRange for_range(cpu_dev_ctx, limit); for_range(actual_functor); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..57891699fd2ad73a1cccce26438528657afdf340 --- /dev/null +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -0,0 +1,515 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/float16.h" + +// set cub base traits in order to handle float16 +namespace cub { +template <> +struct NumericTraits + : BaseTraits {}; +} // namespace cub + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +struct SegmentOffsetIter { + EIGEN_DEVICE_FUNC + explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { + return idx * num_cols_; + } + + int num_cols_; +}; + +// Iter using into a column +struct ColumnIndexIter { + explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()( + const Eigen::array& ix) const { + return ix[0] % num_cols_; + } + + int num_cols_; +}; + +inline static int GetDesiredBlockDim(int dim) { + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +template +__global__ void InitIndex(T* indices, T num_rows, T num_cols) { + int col_id = threadIdx.x; + int row_id = blockIdx.x; + + for (int64_t j = row_id; j < num_rows; j += gridDim.x) { + for (int64_t i = col_id; i < num_cols; i += blockDim.x) { + indices[j * num_cols + i] = i; + } + } +} + +template +struct Pair { + __device__ __forceinline__ Pair() {} + __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} + + __device__ __forceinline__ void set(T value, int64_t id) { + v = value; + id = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v = in.v; + id = in.id; + } + + __device__ __forceinline__ bool operator<(const T value) const { + return (v < value); + } + + __device__ __forceinline__ bool operator>(const T value) const { + return (v > value); + } + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v < in.v) || ((v == in.v) && (id > in.id)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v > in.v) || ((v == in.v) && (id < in.id)); + } + + T v; + int64_t id; +}; + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, + int beam_size, const bool& largest) { + for (int k = beam_size - 2; k >= 0; k--) { + if (largest) 
{ + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } else { + if (topk[k] > p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, int beam_size, + const bool& largest) { + while (idx < dim) { + if (largest) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size, largest); + } + } else { + if (topk[beam_size - 1] > src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size, largest); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, const Pair& max, + int beam_size, const bool& largest) { + while (idx < dim) { + if (largest) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + AddTo(topk, tmp, beam_size, largest); + } + } + } else { + if (topk[beam_size - 1] > src[idx]) { + Pair tmp(src[idx], idx); + if (tmp > max) { + AddTo(topk, tmp, beam_size, largest); + } + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, + int beam_size, const T* src, + bool* firstStep, bool* is_empty, + Pair* max, int dim, + const int tid, bool largest) { + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; + GetTopK(topk, src, tid, dim, length, largest); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - (*beam)) { + topk[k] = topk[k + *beam]; + } else { + topk[k].set(-static_cast(INFINITY), -1); + } + } + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, + length, largest); + } + } + + *max = topk[MaxLength - 1]; + if ((*max).v == -static_cast(1)) *is_empty = true; + *beam = 0; + } +} + +template +__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, + Pair topk[], T** topVal, + int64_t** topIds, int* beam, int* k, + const int tid, const int warp, + const bool& largest) { + while (true) { + __syncthreads(); + if (tid < BlockSize / 2) { + if (largest) { + if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } else { + if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } + } + __syncthreads(); + for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (largest) { + if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } else { + if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = sh_topk[maxid[0]].v; + **topIds = sh_topk[maxid[0]].id; + (*topVal)++; + (*topIds)++; + } + if (tid == maxid[0]) (*beam)++; + if (--(*k) == 0) break; + __syncthreads(); + + if (tid == maxid[0]) { + if (*beam < MaxLength) { + sh_topk[tid] = topk[*beam]; + } + } + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + if (maxid[0] / 32 == warp) { + if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) == + MaxLength) + break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top MaxLength value; + * 2. 
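// Illustrative sketch (not part of the patch): what AddTo above does for the
// "largest" case. Each thread keeps a small beam of (value, index) pairs sorted
// in descending order; a candidate already known to beat the last entry is
// inserted in place and the smallest entry falls off the end. A host-side
// analogue for a beam of size k:
#include <cstdint>
#include <utility>

template <typename T>
void InsertIntoBeam(std::pair<T, int64_t>* beam, int k,
                    const std::pair<T, int64_t>& cand) {
  for (int i = k - 2; i >= 0; --i) {
    if (beam[i].first < cand.first) {
      beam[i + 1] = beam[i];  // shift the smaller entry down one slot
    } else {
      beam[i + 1] = cand;     // insert just below the first larger entry
      return;
    }
  }
  beam[0] = cand;             // candidate is the new maximum
}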
merge to sh_topk, block reduce and get max value; + * 3. go to the second setp, until one thread's topk value is null; + * 4. go to the first setp, until get the topk value. + */ + +template +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, + const T* src, int lds, int dim, int k, + int grid_dim, int num, bool largest = true) { + __shared__ Pair sh_topk[BlockSize]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + + const int bid = blockIdx.x; + for (int i = bid; i < num; i += grid_dim) { + int top_num = k; + __shared__ int maxid[BlockSize / 2]; + T* out = output + i * output_stride; + int64_t* inds = indices + i * k; + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int j = 0; j < MaxLength; j++) { + if (largest) { + topk[j].set(-static_cast(INFINITY), -1); + } else { + topk[j].set(static_cast(INFINITY), -1); + } + } + while (top_num) { + ThreadGetTopK(topk, &beam, k, src + i * lds, + &firststep, &is_empty, &max, dim, + tid, largest); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &out, &inds, + &beam, &top_num, tid, warp, largest); + } + } +} + +template +__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, + size_t rows, size_t cols, size_t k) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + x_grad[i * cols + j] = 0; + } + for (size_t j = 0; j < k; ++j) { + size_t idx = indices[i * k + j]; + x_grad[i * cols + idx] = out_grad[i * k + j]; + } + } +} + +// the grad assign with the axis +template +__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices, + T* grad_in, int pre, int post, + int raw_height, int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + const int& base_index = i * post * k; + const int& base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + const int64_t idx_ij = indices[base_index + j]; + const int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[idx_ij]; + } + } +} +// use the radix sort for the topk +template +bool SortTopk(const platform::CUDADeviceContext& ctx, + const framework::Tensor* input_tensor, const int64_t num_cols, + const int64_t num_rows, const int k, + framework::Tensor* out_tensor, framework::Tensor* indices_tensor, + bool largest = true) { + auto cu_stream = ctx.stream(); + + Tensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = framework::make_ddim(dims); + input_indices.Resize(dim); + // input_indices.Resize(num_rows*num_cols); + input_indices.mutable_data(ctx.GetPlace()); + size_t temp_storage_bytes = -1; + + auto ComputeBlockSize = [](int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; + }; + int block_size = ComputeBlockSize(num_cols); + + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + // actually, int num_rows < max_grid_size + unsigned int grid_size = num_rows < maxGridDimX + ? 
static_cast(num_rows) + : maxGridDimX; + // Init a index array + InitIndex<<>>( + input_indices.data(), num_rows, num_cols); + + // create iter for counting input + cub::CountingInputIterator counting_iter(0); + // segment_offset is used for move to next row + cub::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); + + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + + Tensor temp_values; + Tensor temp_indices; + + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); + + if (k == num_cols) { + // Doing a full sort. + sorted_values_ptr = values; + sorted_indices_ptr = indices; + } else { + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); + sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); + } + + // Get temp storage buffer size, maybe can allocate a fixed buffer to save + // time. + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) + << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } + Tensor temp_storage; + temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); + + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } + auto& dev = *ctx.eigen_device(); + if (k < num_cols) { + // copy sliced data to output. 
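// Illustrative note (not part of the patch): SortTopk above follows the standard
// two-phase CUB calling pattern. The first SortPairs / SortPairsDescending call
// passes a null workspace pointer so CUB only reports temp_storage_bytes; the
// workspace tensor is then allocated and the same call is issued again to do the
// actual segmented sort. Condensed, hypothetical form:
//
//   size_t bytes = 0;
//   cub::DeviceSegmentedRadixSort::SortPairsDescending(
//       nullptr, bytes, keys_in, keys_out, vals_in, vals_out, num_items,
//       num_segments, seg_offsets, seg_offsets + 1, 0, sizeof(T) * 8, stream);
//   void* workspace = Allocate(bytes);  // here: temp_storage.mutable_data(...)
//   cub::DeviceSegmentedRadixSort::SortPairsDescending(
//       workspace, bytes, keys_in, keys_out, vals_in, vals_out, num_items,
//       num_segments, seg_offsets, seg_offsets + 1, 0, sizeof(T) * 8, stream);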
+ const Eigen::DSizes slice_indices{0, 0}; + const Eigen::DSizes slice_sizes{num_rows, k}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = EigenMatrix::From(temp_indices); + + std::vector odims = {static_cast(num_rows), static_cast(k)}; + auto dim = framework::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = EigenMatrix::From(temp_values); + + e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); + e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); + } + return true; +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 82ecc2887ba240560cf15165f21bc995f4683159..d8b2e92616091a8c822c6fd0bfdfb1148c25534d 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once #include +#include #include "cub/cub.cuh" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" // set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -struct Pair { - __device__ __forceinline__ Pair() {} - __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} - - __device__ __forceinline__ void set(T value, int64_t id) { - v = value; - id = id; - } - - __device__ __forceinline__ void operator=(const Pair& in) { - v = in.v; - id = in.id; - } - - __device__ __forceinline__ bool operator<(const T value) const { - return (v < value); - } - - __device__ __forceinline__ bool operator<(const Pair& in) const { - return (v < in.v) || ((v == in.v) && (id > in.id)); - } - - __device__ __forceinline__ bool operator>(const Pair& in) const { - return (v > in.v) || ((v == in.v) && (id < in.id)); - } - - T v; - int64_t id; -}; - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, - int beam_size) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, int beam_size) { - while (idx < dim) { - if 
(topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* src, - bool* firstStep, bool* is_empty, - Pair* max, int dim, - const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, src, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - (*beam)) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -static_cast(1)) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* val, - int* col, bool* firstStep, - bool* is_empty, Pair* max, - int dim, const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, val, col, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - *beam) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, - Pair topk[], T** topVal, - int64_t** topIds, int* beam, int* k, - const int tid, const int warp) { - while (true) { - __syncthreads(); - if (tid < BlockSize / 2) { - if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { - maxid[tid] = tid + BlockSize / 2; - } else { - maxid[tid] = tid; - } - } - __syncthreads(); - for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { - if (tid < stride) { - if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { - maxid[tid] = maxid[tid + stride]; - } - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - **topVal = sh_topk[maxid[0]].v; - **topIds = sh_topk[maxid[0]].id; - (*topVal)++; - (*topIds)++; - } - if (tid == maxid[0]) (*beam)++; - if (--(*k) == 0) break; - __syncthreads(); - - if (tid == maxid[0]) { - if (*beam < MaxLength) { - sh_topk[tid] = topk[*beam]; - } - } - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - if (maxid[0] / 32 == warp) { - if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) == - MaxLength) - break; - } - } -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top MaxLength value; - * 2. merge to sh_topk, block reduce and get max value; - * 3. go to the second setp, until one thread's topk value is null; - * 4. go to the first setp, until get the topk value. 
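The block comment above describes the removed kernel's per-thread step: each thread keeps a small buffer of (value, index) pairs in descending order and inserts a candidate by shifting smaller entries down. A minimal host-side C++ sketch of that insertion (illustrative only, not the device code):

#include <array>
#include <cstdint>
#include <cstdio>

struct Pair {
  float v;
  int64_t id;
  bool operator<(const Pair& o) const {  // ties resolved in favor of the smaller index
    return (v < o.v) || (v == o.v && id > o.id);
  }
};

// Insert `p` into a buffer kept in descending order (largest at index 0),
// dropping the smallest entry -- the host-side analogue of AddTo above.
template <int N>
void AddTo(std::array<Pair, N>& topk, const Pair& p) {
  for (int k = N - 2; k >= 0; --k) {
    if (topk[k] < p) {
      topk[k + 1] = topk[k];  // shift the smaller entry down one slot
    } else {
      topk[k + 1] = p;        // found the insertion slot
      return;
    }
  }
  topk[0] = p;                // p is the new maximum
}

int main() {
  std::array<Pair, 3> topk{{{-1e30f, -1}, {-1e30f, -1}, {-1e30f, -1}}};
  const float src[] = {3.f, 9.f, 1.f, 7.f, 5.f};
  for (int64_t i = 0; i < 5; ++i) {
    if (topk[2] < Pair{src[i], i}) AddTo(topk, Pair{src[i], i});
  }
  for (const auto& p : topk) std::printf("(%g, %lld) ", p.v, (long long)p.id);
  std::printf("\n");          // prints: (9, 1) (7, 3) (5, 4)
  return 0;
}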
- */ - -template -__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, - const T* src, int lds, int dim, int k, - int grid_dim, int num) { - __shared__ Pair sh_topk[BlockSize]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - - const int bid = blockIdx.x; - for (int i = bid; i < num; i += grid_dim) { - int top_num = k; - __shared__ int maxid[BlockSize / 2]; - T* out = output + i * output_stride; - int64_t* inds = indices + i * k; - Pair topk[MaxLength]; - int beam = MaxLength; - Pair max; - bool is_empty = false; - bool firststep = true; - - for (int j = 0; j < MaxLength; j++) { - topk[j].set(-static_cast(INFINITY), -1); - } - while (top_num) { - ThreadGetTopK( - topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid); - - sh_topk[tid] = topk[0]; - BlockReduce(sh_topk, maxid, topk, &out, &inds, - &beam, &top_num, tid, warp); - } - } -} - -template -__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, - size_t rows, size_t cols, size_t k) { - for (size_t i = 0; i < rows; ++i) { - for (size_t j = 0; j < cols; ++j) { - x_grad[i * cols + j] = 0; - } - for (size_t j = 0; j < k; ++j) { - size_t idx = indices[i * k + j]; - x_grad[i * cols + idx] = out_grad[i * k + j]; - } - } -} - -inline static int GetDesiredBlockDim(int dim) { - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -} - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -// Iter using into a column -struct ColumnIndexIter { - explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()( - const Eigen::array& ix) const { - return ix[0] % num_cols_; - } - - int num_cols_; -}; - -__global__ void InitIndex(int64_t* indices, int64_t num_rows, - int64_t num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (int64_t j = row_id; j < num_rows; j += gridDim.x) { - for (int64_t i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -bool SortTopk(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = framework::make_ddim(dims); - input_indices.Resize(dim); - // input_indices.Resize(num_rows*num_cols); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; - // actually, int num_rows < max_grid_size - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - // Init a index array - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - - Tensor temp_values; - Tensor temp_indices; - - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - - if (k == num_cols) { - // Doing a full sort. - sorted_values_ptr = values; - sorted_indices_ptr = indices; - } else { - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - } - - // Get temp storage buffer size, maybe can allocate a fixed buffer to save - // time. - auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " - "temp_storage_bytes, status: " - << cudaGetErrorString(err); - return false; - } - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " - "temp_storage_bytes: " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } - auto& dev = *ctx.eigen_device(); - if (k < num_cols) { - // copy sliced data to output. - const Eigen::DSizes slice_indices{0, 0}; - const Eigen::DSizes slice_sizes{num_rows, k}; - auto e_indices = EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = EigenMatrix::From(temp_indices); - - std::vector odims = {static_cast(num_rows), static_cast(k)}; - auto dim = framework::make_ddim(odims); - auto e_values = EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = EigenMatrix::From(temp_values); - - e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); - e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); - } - return true; -} - #define FIXED_BLOCK_DIM_BASE(dim, ...) 
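SortTopk above uses the standard two-call CUB idiom: the first SortPairsDescending call with a null workspace only reports temp_storage_bytes, the second performs the per-row descending sort, and the top-k is then just the first k columns of every row. A standalone CUDA C++ sketch of that idiom (illustrative only; error checks and frees omitted):

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

struct RowOffset {  // maps a segment (row) index to its element offset
  int cols;
  __host__ __device__ int operator()(int i) const { return i * cols; }
};

int main() {
  const int rows = 2, cols = 4, n = rows * cols;
  float h_keys[n] = {3, 1, 4, 2, 8, 5, 7, 6};
  int h_vals[n] = {0, 1, 2, 3, 0, 1, 2, 3};

  float *d_keys_in, *d_keys_out;
  int *d_vals_in, *d_vals_out;
  cudaMalloc(&d_keys_in, n * sizeof(float));
  cudaMalloc(&d_keys_out, n * sizeof(float));
  cudaMalloc(&d_vals_in, n * sizeof(int));
  cudaMalloc(&d_vals_out, n * sizeof(int));
  cudaMemcpy(d_keys_in, h_keys, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_vals_in, h_vals, n * sizeof(int), cudaMemcpyHostToDevice);

  cub::CountingInputIterator<int> counting(0);
  cub::TransformInputIterator<int, RowOffset, cub::CountingInputIterator<int>>
      offsets(counting, RowOffset{cols});

  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // Pass 1: null workspace, CUB only reports the required size.
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      n, rows, offsets, offsets + 1);
  cudaMalloc(&d_temp, temp_bytes);
  // Pass 2: the actual per-row descending key/value sort.
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      n, rows, offsets, offsets + 1);

  cudaMemcpy(h_keys, d_keys_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) std::printf("%g ", h_keys[i]);  // 4 3 2 1 8 7 6 5
  std::printf("\n");
  return 0;
}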
\ case (dim): { \ constexpr auto kBlockDim = (dim); \ @@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel { framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); const int64_t input_width = inputdims[inputdims.size() - 1]; const auto& dev_ctx = ctx.cuda_device_context(); - if ((input_width <= 1024 || k >= 128 || k == input_width)) { if (SortTopk(dev_ctx, input, input_width, input_height, k, output, indices)) { @@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1)); const size_t col = xdims[xdims.size() - 1]; const auto& dev_ctx = context.cuda_device_context(); - const int kMaxHeight = 2048; int gridx = row < kMaxHeight ? row : kMaxHeight; switch (GetDesiredBlockDim(col)) { @@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle - REGISTER_OP_CUDA_KERNEL( top_k, paddle::operators::TopkOpCUDAKernel + +namespace paddle { +namespace operators { + +class TopkV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of TopkOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + const int& dim_size = input_dims.size(); + const int k = static_cast(ctx->Attrs().Get("k")); + int axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, + "the axis of topk" + "must be [-%d, %d), but you set axis is %d", + dim_size, dim_size, axis); + + if (axis < 0) axis += dim_size; + + PADDLE_ENFORCE_GE( + k, 1, "the attribute of k in the topk must >= 1, but received %d .", k); + PADDLE_ENFORCE_GE(input_dims.size(), 1, + "input of topk must have >= 1d shape"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE( + input_dims[axis], k, + "input of topk op must have >= %d columns in axis of %d", k, axis); + } + + framework::DDim dims = input_dims; + + dims[axis] = k; + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(), + layout_, library_); + } +}; + +class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Topk op"); + AddInput("K", + "(Tensor) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .AsDispensable(); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddComment(R"DOC( +Top K operator + +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. +Thus values[j] is the j-th largest entry in input, and its index is indices[j]. 
+ +For matrices, this operator computes the top k entries in each row. )DOC"); + AddAttr("k", + "(int, default 1) Number of top elements to look for along " + "the tensor).") + .SetDefault(1); + AddAttr("axis", + "the axis to sort and get the k indices, value." + "if not set, will get k value in last axis.") + .SetDefault(-1); + AddAttr("largest", + "control flag whether to return largest or smallest") + .SetDefault(true); + AddAttr("sorted", + "control flag whether to return elements in sorted order") + .SetDefault(true); + } +}; + +class TopkV2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument("Input(X) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Indices"), true, + platform::errors::InvalidArgument("Input(Indices) should be not null")); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + platform::errors::InvalidArgument( + "Grad Input(Out) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::InvalidArgument("Grad Output(X) should be not null")); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class TopkV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("top_k_v2_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Output("Indices")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, + ops::TopkV2GradOpMaker, + ops::TopkV2GradOpMaker); + +REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); + +REGISTER_OP_CPU_KERNEL(top_k_v2, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel) + +REGISTER_OP_CPU_KERNEL( + top_k_v2_grad, ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2c94dca1e3a461a44b98e9acf604cc4b488b5fd7 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.cu @@ -0,0 +1,271 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
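The InferShape rule above keeps every input dimension except the one at `axis`, which becomes k, after normalizing a negative axis. A small standalone sketch of that rule (illustrative names, not the operator code):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Output shape of top_k_v2: same as the input except size k along `axis`.
std::vector<int64_t> InferTopkV2Shape(std::vector<int64_t> dims, int axis, int k) {
  const int rank = static_cast<int>(dims.size());
  assert(axis >= -rank && axis < rank);  // the op enforces axis in [-rank, rank)
  if (axis < 0) axis += rank;            // e.g. axis = -1 means the last dimension
  assert(k >= 1 && dims[axis] >= k);     // checked at runtime by the op
  dims[axis] = k;
  return dims;
}

int main() {
  auto out = InferTopkV2Shape({2, 6, 4}, -2, 3);          // axis -2 normalizes to 1
  for (auto d : out) std::printf("%lld ", (long long)d);  // 2 3 4
  std::printf("\n");
  return 0;
}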
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/fluid/operators/top_k_v2_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +#define FIXED_BLOCK_DIM_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kBlockDim = (dim); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM(...) \ + FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) + +template +class TopkV2OpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + + // get the attributes + int k = static_cast(ctx.Attr("k")); + int axis = static_cast(ctx.Attr("axis")); + const bool& sorted = static_cast(ctx.Attr("sorted")); + const bool& largest = static_cast(ctx.Attr("largest")); + + // get the input dims + const auto& in_dims = input->dims(); + // calcluate the real axis + if (axis < 0) axis += in_dims.size(); + + auto* k_t = ctx.Input("K"); + if (k_t) { + Tensor k_host; + framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); + k = k_host.data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[axis] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + + const auto& out_dims = output->dims(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + if (axis == in_dims.size() - 1) { + // if get the topK from the last axis + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + const auto& dev_ctx = ctx.cuda_device_context(); + + if (k > input_width) k = input_width; + + if ((input_width <= 1024 || k >= 128 || k == input_width)) { + if (SortTopk(dev_ctx, input, input_width, input_height, k, output, + indices, largest)) { + // Successed, return. + return; + } else { + LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " + "default topk kernel."; + } + } + + // NOTE: pass lds and dim same to input width. + // NOTE: old matrix implementation of stride is different to eigen. + const int kMaxHeight = 2048; + int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; + switch (GetDesiredBlockDim(input_width)) { + FIXED_BLOCK_DIM( + KeMatrixTopK<<>>( + output_data, k, indices_data, input_data, input_width, + input_width, static_cast(k), gridx, input_height, + largest)); + default: + PADDLE_THROW(platform::errors::Fatal( + "the input data shape has error in the topk cuda kernel.")); + } + } else { + // if get topK not from the last axis, will tranpose the tensor and get + // TopK + + // first step, prepare the trans args for the tranpose + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(output->dims()); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = out_dims[trans[i]]; + } + // second step, tranpose the input + Tensor trans_input; + trans_input.mutable_data(trans_dims, ctx.GetPlace()); + int ndims = trans.size(); + const auto& dev_ctx = ctx.cuda_device_context(); + TransCompute(ndims, dev_ctx, *input, + &trans_input, trans); + // third step, calcluate the topk + // allocate the tmp cuda memory for the tmp result + Tensor trans_ind; + trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); + Tensor trans_out; + trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + + if (k > input_width) k = input_width; + + if ((input_width <= 1024 || k >= 128 || k == input_width)) { + if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, + &trans_out, &trans_ind, largest)) { + // last step, tranpose back the indices and output + TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + TransCompute( + ndims, dev_ctx, trans_out, output, trans); + return; + } else { + LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " + "default topk kernel."; + } + } + + const int kMaxHeight = 2048; + int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; + switch (GetDesiredBlockDim(input_width)) { + FIXED_BLOCK_DIM( + KeMatrixTopK<<>>( + trans_out.data(), k, trans_ind.data(), + trans_input.data(), input_width, input_width, + static_cast(k), gridx, input_height, largest)); + default: + PADDLE_THROW(platform::errors::Fatal( + "the input data shape has error in the topk cuda kernel.")); + } + + // last step, tranpose back the indices and output + TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + TransCompute(ndims, dev_ctx, trans_out, + output, trans); + } + } +}; + +#undef FIXED_BLOCK_DIM_BASE +#undef FIXED_BLOCK_DIM +template +class TopkV2OpGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::InvalidArgument("It must use CUDAPlace.")); + auto* x = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = context.Output(framework::GradVarName("X")); + int axis = context.Attr("axis"); + + const auto& in_dims = x->dims(); + const auto& out_dims = indices->dims(); + + // get the real the axis and the k + if (axis < 0) axis += in_dims.size(); + const int& k = out_dims[axis]; + const int& raw_height = in_dims[axis]; + + // allocate the cuda memory for the x_grad + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + const T* out_grad_data = out_grad->data(); + const int64_t* indices_data = indices->data(); + + int pre, n, post; + GetDims(in_dims, axis, &pre, &n, &post); + + // calcluate the block and grid num + auto& dev_ctx = context.cuda_device_context(); + auto ComputeBlockSize = [](int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; + }; + int block_size = ComputeBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + + // lanuch the cuda kernel to assign the grad + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, k); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL( + top_k_v2, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, float>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, double>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, int>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, int64_t>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..89b5d36b1b3f915e8719c8791e8c12c2e0348f26 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.h @@ -0,0 +1,334 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
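The grad kernel above splits the input shape around `axis` into (pre, n, post) so the gradient can be scattered slice by slice. A standalone sketch of that decomposition (illustrative, mirroring the GetDims helper):

#include <cstdint>
#include <cstdio>
#include <vector>

// Flatten a shape around `axis` into (pre, n, post):
//   pre  = product of dimensions before axis
//   n    = dims[axis]
//   post = product of dimensions after axis
// Element (i, j, l) of the logical [pre, n, post] view sits at flat offset
// (i * n + j) * post + l.
void GetDims(const std::vector<int64_t>& dims, int axis, int* pre, int* n,
             int* post) {
  *pre = 1;
  *n = static_cast<int>(dims[axis]);
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= static_cast<int>(dims[i]);
  for (size_t i = axis + 1; i < dims.size(); ++i) *post *= static_cast<int>(dims[i]);
}

int main() {
  int pre, n, post;
  GetDims({2, 5, 3, 4}, 1, &pre, &n, &post);
  std::printf("pre=%d n=%d post=%d\n", pre, n, post);  // pre=2 n=5 post=12
  return 0;
}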
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + The reason why we need the topk v2 is because the compatibility. We redefine + the NaN is maximum value + in the process of comparing. If do not add the topk v2, will affect the + inference result of model that traing + by the older version paddlepaddle. +*/ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template +static void FullTopK(Type input_height, Type input_width, int input_dim, + const framework::Tensor* input, T* t_out, Type* t_indices, + const int& k, const bool& largest, const bool& sorted) { + // when the k is small, will the partial sort + bool partial_sort_flag = (k * 64) < input_width; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + // Eigen::DSizes flat2dims(input_height, input_width); + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), col_vec.begin() + k, col_vec.end(), + [&largest](const std::pair& l, const std::pair& r) { + if (largest) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + } else { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + } + }); + } else { + // use the nth-element to get the K-larger or K-small element + if (largest) { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + // the nth-element will get the unorder elements, sort the element + if (sorted) { + std::sort(col_vec.begin(), col_vec.begin() + k - 1, + [&largest](const std::pair& l, + const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + } + } else { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + // 
the nth-element will get the unorder elements, sort the element + if (sorted) { + std::sort( + col_vec.begin(), col_vec.begin() + k - 1, + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + } + } + for (Type j = 0; j < k; ++j) { + t_out[i * k + j] = col_vec[j].first; + t_indices[i * k + j] = col_vec[j].second; + } + } +} + +template +static void FullTopKAssign(const Type& input_height, const Type& input_width, + const int& input_dim, const framework::Tensor* input, + const framework::Tensor* indices, T* output_data, + const int& k) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(j)] = e_input(j); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(i, j)] = e_input(i, j); + } + } + } +} + +template +class TopkV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Get the top k elements of each row of input tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto* indices = context.Output("Indices"); + const auto& in_dims = input->dims(); + int k = static_cast(context.Attr("k")); + const auto& sorted = static_cast(context.Attr("sorted")); + const auto& largest = static_cast(context.Attr("largest")); + + // axis < 0, cacluate the real axis + int axis = static_cast(context.Attr("axis")); + if (axis < 0) axis += in_dims.size(); + + // if K tensor is not null, will the use K tesnor as k + auto* k_t = context.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + // accroding to axis to set K value in the dim + output_dims[axis] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + + T* output_data = output->mutable_data(context.GetPlace()); + int64_t* indices_data = indices->mutable_data(context.GetPlace()); + const auto& out_dims = output->dims(); + if (axis + 1 == in_dims.size()) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + FullTopK(input_height, input_width, in_dims.size(), input, + output_data, indices_data, k, largest, sorted); + } else { + // if the topk dims is not last dim, will tranpose and do topk + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + + // get the trans input_dims, out_dims + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(output->dims()); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + } + for (size_t i = 0; i < trans.size(); i++) { + trans_out_dims[i] = out_dims[trans[i]]; + } + + Tensor trans_inp; + trans_inp.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // transpose the input value + TransCompute(ndims, dev_context, 
*input, + &trans_inp, trans); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + + // Allocate the temp tensor to the save the topk indices, values + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); + Tensor tmp_indices; + auto* t_ind = + tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); + + // get the TopK value + FullTopK(input_height, input_width, in_dims.size(), + &trans_inp, t_out, t_ind, k, largest, sorted); + // transpose back + TransCompute( + ndims, dev_context, tmp_indices, indices, trans); + TransCompute(ndims, dev_context, tmp_out, + output, trans); + } + } +}; + +template +class TopkV2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = context.Output(framework::GradVarName("X")); + int axis = static_cast(context.Attr("axis")); + + const auto& in_dims = x->dims(); + const auto& out_dims = indices->dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + const size_t& k = out_dims[axis]; + + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + if (axis + 1 == in_dims.size()) { + // allocate the memory for the input_grad + + // assign the out_grad to input_grad directly + const int64_t input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements has no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, + indices, x_grad_data, k); + } else { + // can not assign grad to input_grad, must do the transpose + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + framework::DDim trans_dims(out_dims); + framework::DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + // transpose the out_grad, indices + Tensor trans_dO; + trans_dO.mutable_data(trans_dims, context.GetPlace()); + Tensor trans_ind; + trans_ind.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // Do transpose + TransCompute(ndims, dev_context, *out_grad, + &trans_dO, trans); + TransCompute( + ndims, dev_context, *indices, &trans_ind, trans); + const int64_t input_height = framework::product( + framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + + // Assign the out_grad to tranpose input_grad + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + FullTopKAssign(input_height, input_width, in_dims.size(), + &trans_dO, &trans_ind, t_out, k); + + // Transpose back + TransCompute(ndims, dev_context, tmp_out, + x_grad, trans); + } + } +}; + +} // namespace 
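Both kernels above handle a non-last axis by building a permutation that swaps `axis` with the last dimension; because a two-element swap is its own inverse, the same `trans` vector transposes the results back. A small sketch of that permutation (illustrative only; the operators take this path only when axis is not already the last dimension):

#include <cstdio>
#include <vector>

// Swap `axis` with the last dimension and keep everything else in place.
// Applying the same permutation twice restores the original order, which is
// why the same vector is reused for the reverse transpose.
std::vector<int> BuildTopkTranspose(int rank, int axis) {
  std::vector<int> trans;
  for (int i = 0; i < axis; ++i) trans.push_back(i);
  trans.push_back(rank - 1);
  for (int i = axis + 1; i < rank - 1; ++i) trans.push_back(i);
  trans.push_back(axis);
  return trans;
}

int main() {
  auto trans = BuildTopkTranspose(4, 1);
  for (int t : trans) std::printf("%d ", t);  // 0 3 2 1
  std::printf("\n");
  return 0;
}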
operators +} // namespace paddle diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 9e158abba747d124c83e0366b9c0c5845c49e183..419f0f7a2a57822b422986f9b676de138a7404cd 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include #include + +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -161,18 +163,15 @@ class CPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); std::uniform_real_distribution dist(std::numeric_limits::min(), 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); + + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); for (int64_t i = 0; i < size; ++i) { - data[i] = truncated_normal(dist(engine)); + data[i] = truncated_normal(dist(*engine)); } } }; diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index a4487cde277990a725fd4c37b6d807278e314343..9cffe09a33abf29308072d6b3c8bfb8a636048da 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/uniform_random_op.h" + #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -62,34 +64,12 @@ class CPUUniformRandomKernel : public framework::OpKernel { std::uniform_real_distribution dist( static_cast(ctx.Attr("min")), static_cast(ctx.Attr("max"))); - auto gen_ptr = framework::Generator::GetInstance(); - if (gen_ptr->is_init_py) { - std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine(); - // auto gen_engine = gen_ptr_->GetCPUEngine(); - // std::uniform_real_distribution dist( - // static_cast(ctx.Attr("min")), - // static_cast(ctx.Attr("max"))); + unsigned int seed = static_cast(ctx.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(gen_engine); - } - } else { - unsigned int seed = static_cast(ctx.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - // std::uniform_real_distribution dist( - // static_cast(ctx.Attr("min")), - // static_cast(ctx.Attr("max"))); - // int64_t size = tensor->numel(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); } - // std::mt19937_64 &engine = gen_ptr->GetCPUEngine(); - // auto engine = gen_ptr_->GetCPUEngine(); unsigned int diag_num = static_cast(ctx.Attr("diag_num")); @@ -139,12 +119,12 @@ class UniformRandomOp : public framework::OperatorWithKernel { if (ctx->HasInputs("ShapeTensorList")) { // top prority shape auto inputs_name = ctx->Inputs("ShapeTensorList"); - PADDLE_ENFORCE_GT( - inputs_name.size(), 0, - platform::errors::InvalidArgument( - "Input(ShapeTensorList)'size of Op(uniform_random) can't be zero." - "Please check the Attr(shape)'s size of" - "Op(fluid.layers.uniform_random).)")); + PADDLE_ENFORCE_GT(inputs_name.size(), 0, + platform::errors::InvalidArgument( + "Input(ShapeTensorList)'size of " + "Op(uniform_random) can't be zero." + "Please check the Attr(shape)'s size of" + "Op(fluid.layers.uniform_random).)")); auto out_dims = std::vector(inputs_name.size(), -1); ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index c024bb87b09c00c34dbaaf7b747f29743152502f..4df1e0ffeb97564803f452114d52ab03d0464f8a 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
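The CPU random kernels above now draw their engine from framework::GetCPURandomEngine(seed) rather than seeding a local std::minstd_rand in every kernel. A standalone sketch of the seed convention involved (an illustrative stand-in, not the Paddle implementation; seed == 0 means non-deterministic):

#include <cstdint>
#include <cstdio>
#include <memory>
#include <random>

// Illustrative stand-in for a shared engine factory: a fixed non-zero seed
// gives a reproducible sequence; seed == 0 falls back to std::random_device.
std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t seed) {
  if (seed == 0) {
    seed = std::random_device{}();
  }
  return std::make_shared<std::mt19937_64>(seed);
}

int main() {
  auto engine = GetCPURandomEngine(42);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  for (int i = 0; i < 4; ++i) {
    std::printf("%f ", dist(*engine));  // same sequence on every run for seed 42
  }
  std::printf("\n");
  return 0;
}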
*/ #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -88,15 +89,12 @@ class GPUUniformRandomKernel : public framework::OpKernel { } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - if (framework::Generator::GetInstance()->is_init_py) { - seed = static_cast( - framework::Generator::GetInstance()->GetCurrentSeed()); - } else { - if (seed == 0) { - std::random_device rd; - seed = rd(); - } + + if (seed == 0) { + std::random_device rd; + seed = rd(); } + T min = static_cast(context.Attr("min")); T max = static_cast(context.Attr("max")); unsigned int diag_num = diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index c141033b2b3e6b9fdeac88610dd1362ba8f98428..745102dd28d3d578ec3674221645fc1e8bdfe43a 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unique_op.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -24,17 +25,63 @@ class UniqueOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - auto in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_dims.size(), 1, - platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " - "But now the dims of Input(X) is %d.", - in_dims.size())); + if (!ctx->Attrs().Get("is_sorted")) { + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + PADDLE_ENFORCE_EQ(in_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input(X) should be 1-D Tensor, " + "But now the dims of Input(X) is %d.", + in_dims.size())); + + ctx->SetOutputDim("Out", {-1}); + ctx->SetOutputDim("Index", in_dims); + return; + } + + bool return_index = ctx->Attrs().Get("return_index"); + bool return_inverse = ctx->Attrs().Get("return_inverse"); + bool return_counts = ctx->Attrs().Get("return_counts"); + auto axis_vec = ctx->Attrs().Get>("axis"); + + if (return_index) { + OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); + } + if (return_inverse) { + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + } + if (return_counts) { + OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); + } - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); + if (axis_vec.empty()) { + ctx->SetOutputDim("Out", {-1}); + if (return_inverse) { + ctx->SetOutputDim("Index", {framework::product(in_dims)}); + } + } else { + int axis = axis_vec[0]; + if (axis < 0) { + axis += in_dims.size(); + } + PADDLE_ENFORCE_LT( + axis, in_dims.size(), + platform::errors::InvalidArgument("The axis(%d) should be less than " + "the dimension size(%d) of x.", + axis, in_dims.size())); + auto out_dims = in_dims; + out_dims[axis] = -1; + ctx->SetOutputDim("Out", out_dims); + if (return_inverse) { + ctx->SetOutputDim("Index", {in_dims[axis]}); + } + } + if (return_index) { + ctx->SetOutputDim("Indices", {-1}); + } + if (return_counts) { + ctx->SetOutputDim("Counts", {-1}); + } } protected: @@ -49,14 +96,47 @@ class 
UniqueOp : public framework::OperatorWithKernel { class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "Input tensor. It should be a 1-D tensor."); + AddInput("X", + "Input tensor. It should be a 1-D tensor when Attr(is_sorted)" + " is fasle or a N-D tensor when Attr(is_sorted) is true."); AddAttr("dtype", "data type for output index"); AddOutput("Out", "A unique subsequence for input tensor."); AddOutput("Index", - "An index tensor pointing to unique subsequence, which has " - "identical shape with input tensor and int64 dtype."); + "Equivalent to inverse in numpy.unique, " + "the indices for where elements in the original input ended up " + "in the returned unique tensor."); + AddOutput( + "Indices", + "The indices of the input tensor that result in the unique tensor.") + .AsDispensable(); + AddOutput("Counts", "The counts for each unique element.").AsDispensable(); + AddAttr("return_index", + "If True, also return the indices of the input" + " tensor that result in the unique Tensor.") + .SetDefault(false); + AddAttr( + "return_inverse", + "If True, also return the indices for where elements" + " in the original input ended up in the returned unique tensor.") + .SetDefault(false); + AddAttr("return_counts", + "If True, also return the counts for each unique element.") + .SetDefault(false); + AddAttr>( + "axis", + "The axis to apply unique. If None, the input will be flattened.") + .SetDefault({}); + AddAttr("is_sorted", + "If True, the unique elements of X are in ascending order." + "Otherwise, the unique elements are not sorted.") + .SetDefault(false); AddComment(R"DOC( - Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence + 1. Return a unique subsequence for 1-D input tensor, and an index tensor + pointing to this unique subsequence when Attr(is_sorted) is false. This + means paddle.unique is called. + + 2. Returns the unique elements of X in ascending order when Attr(is_sorted) + is true. This means fluid.layers.unique is called. )DOC"); } }; @@ -65,6 +145,39 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); -REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel, - ops::UniqueKernel, ops::UniqueKernel, - ops::UniqueKernel); +REGISTER_OP_CPU_KERNEL( + unique, ops::UniqueKernel, + ops::UniqueKernel, + ops::UniqueKernel, + ops::UniqueKernel); +REGISTER_OP_VERSION(unique) + .AddCheckpoint( + R"ROC( + Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute + [return_index, return_inverse, return_counts, axis, is_sorted]. + )ROC", + paddle::framework::compatible::OpVersionDesc() + .NewOutput("Indices", + "The indices of the input tensor that result in the " + "unique tensor.") + .NewOutput("Counts", "The counts for each unique element.") + .NewAttr("return_index", + "If True, also return the indices of the input" + " tensor that result in the unique Tensor.", + false) + .NewAttr("return_inverse", + "If True, also return the indices for where elements" + " in the original input ended up in the returned unique " + "tensor.", + false) + .NewAttr("return_counts", + "If True, also return the counts for each unique element.", + false) + .NewAttr("axis", + "The axis to apply unique. If None, the input will be " + "flattened.", + {}) + .NewAttr("is_sorted", + "If True, the unique elements of X are in ascending order." 
+ "Otherwise, the unique elements are not sorted.", + false)); diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index cdfd797cbfdf87a42ce0834eea9467010a058431..2bd2a2cbf34c6ccba1e6bfd1892f0f821d0f7c72 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -13,12 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include +#include +#include #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { @@ -104,17 +109,313 @@ struct UniqueOpFunctor { } }; +static std::vector Unbind(const framework::Tensor& in) { + int64_t size = in.dims()[0]; + std::vector tensors(size); + for (int64_t i = 0; i < size; ++i) { + tensors[i] = in.Slice(i, i + 1); + } + return tensors; +} + template +static bool Equal(const framework::Tensor& a, const framework::Tensor& b) { + if (a.numel() != b.numel()) { + return false; + } + for (int64_t i = 0; i < a.numel(); ++i) { + if (a.data()[i] != b.data()[i]) { + return false; + } + } + return true; +} + +template +static void UniqueFlattendTensor(const framework::ExecutionContext& context, + const framework::Tensor& in, + framework::Tensor* out, bool return_index, + bool return_inverse, bool return_counts) { + const InT* in_data = in.data(); + std::set unique(in_data, in_data + in.numel()); + out->Resize(framework::make_ddim({static_cast(unique.size())})); + auto out_data = out->mutable_data(context.GetPlace()); + std::copy(unique.begin(), unique.end(), out_data); + + if (return_index) { + auto* indices = context.Output("Indices"); + indices->Resize(framework::make_ddim({out->numel()})); + auto indices_data = indices->mutable_data(context.GetPlace()); + std::unordered_map indices_map; + indices_map.reserve(out->numel()); + for (int64_t i = 0; i < in.numel(); ++i) { + if (indices_map.find(in_data[i]) != indices_map.end()) continue; + indices_map[in_data[i]] = i; + } + for (int64_t i = 0; i < out->numel(); ++i) { + indices_data[i] = indices_map[out_data[i]]; + } + } + + if (return_inverse) { + auto* inverse = context.Output("Index"); + inverse->Resize(framework::make_ddim({in.numel()})); + auto inverse_data = inverse->mutable_data(context.GetPlace()); + std::unordered_map inverse_map; + inverse_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + inverse_map[out_data[i]] = i; + } + for (int64_t i = 0; i < in.numel(); ++i) { + inverse_data[i] = inverse_map[in_data[i]]; + } + } + + if (return_counts) { + auto* count = context.Output("Counts"); + count->Resize(framework::make_ddim({out->numel()})); + auto count_data = count->mutable_data(context.GetPlace()); + std::unordered_map counts_map; + counts_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + counts_map[out_data[i]] = 0; + } + for (int64_t i = 0; i < in.numel(); i++) { + counts_map[in_data[i]] += 1; + } + for (int64_t i = 0; i < out->numel(); i++) { + count_data[i] = counts_map[out_data[i]]; + } + } +} + +template +static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context, + ForwardIt first, ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec, + std::vector* indices_vec) { + if (first == last) { + return last; + } + + 
(*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + (*indices_vec)[0] = sorted_indices_vec[0]; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + (*indices_vec)[idx_result] = sorted_indices_vec[idx_first]; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueDim(const framework::ExecutionContext& context, + const framework::Tensor& in, framework::Tensor* out, + bool return_index, bool return_inverse, + bool return_counts, int axis) { + // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(framework::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + framework::Tensor in_trans; + framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + in_trans.mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + TransCompute(in.dims().size(), dev_ctx, in, &in_trans, + permute); + // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] + framework::DDim in_trans_flat_dims = + framework::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // sort indices + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const InT* in_trans_data = in_trans.data(); + std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + // sort tensor according to indices + framework::Tensor input_sorted; + input_sorted.Resize(in_trans_dims); + input_sorted.mutable_data(context.GetPlace()); + InT* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + static_cast(sorted_indices_vec[i]) * col, + col * sizeof(InT)); + } + + std::vector input_unbind = Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + std::vector indices_vec(sorted_indices_vec.size(), 0); + auto last = UniqueDimImpl::iterator, InT>( + context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec, + &inverse_vec, &counts_vec, &indices_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + indices_vec.erase(indices_vec.begin() + input_unbind.size(), + indices_vec.end()); + + math::ConcatFunctor concat_functor; + framework::Tensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(framework::make_ddim(out_trans_dims_vec)); + out_trans.mutable_data(context.GetPlace()); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + 
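The axis path above orders row indices lexicographically over the flattened rows before deduplicating adjacent equal rows. A standalone sketch of that comparator (illustrative only):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Three "rows" of width 2, stored row-major: {3,1}, {2,5}, {2,4}.
  const std::vector<int> data = {3, 1, 2, 5, 2, 4};
  const int64_t col = 2;
  std::vector<int64_t> idx(data.size() / col);
  std::iota(idx.begin(), idx.end(), 0);

  // Order row indices lexicographically, mirroring the comparator above:
  // compare element by element, the first difference decides.
  std::sort(idx.begin(), idx.end(), [&](int64_t a, int64_t b) {
    for (int64_t i = 0; i < col; ++i) {
      int lhs = data[a * col + i];
      int rhs = data[b * col + i];
      if (lhs < rhs) return true;
      if (lhs > rhs) return false;
    }
    return false;  // equal rows compare as equivalent
  });

  for (auto r : idx) std::printf("%lld ", (long long)r);  // 2 1 0
  std::printf("\n");
  return 0;
}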
out->Resize(framework::make_ddim(out_trans_dims_vec)); + out->mutable_data(context.GetPlace()); + concat_functor(dev_ctx, input_unbind, 0, &out_trans); + TransCompute(out_trans.dims().size(), dev_ctx, out_trans, + out, permute); + + if (return_inverse) { + auto* inverse = context.Output("Index"); + framework::TensorFromVector(inverse_vec, context.device_context(), inverse); + } + + if (return_counts) { + auto* count = context.Output("Counts"); + framework::TensorFromVector(counts_vec, context.device_context(), count); + } + + if (return_index) { + auto* indices = context.Output("Indices"); + framework::TensorFromVector(indices_vec, context.device_context(), indices); + } +} + +template +struct UniqueFlattendTensorFunctor { + const framework::ExecutionContext& ctx_; + const framework::Tensor& in_; + framework::Tensor* out_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueFlattendTensorFunctor(const framework::ExecutionContext& context, + const framework::Tensor& in, + framework::Tensor* out, bool return_index, + bool return_inverse, bool return_counts) + : ctx_(context), + in_(in), + out_(out), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueFlattendTensor(ctx_, in_, out_, return_index_, + return_inverse_, return_counts_); + } +}; + +template +struct UniqueDimFunctor { + const framework::ExecutionContext& ctx_; + const framework::Tensor& in_; + framework::Tensor* out_; + const int axis_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueDimFunctor(const framework::ExecutionContext& context, + const framework::Tensor& in, framework::Tensor* out, + const int axis, bool return_index, bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + axis_(axis), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueDim( + ctx_, in_, out_, return_index_, return_inverse_, return_counts_, axis_); + } +}; + +template class UniqueKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); auto* x = context.Input("X"); auto* out = context.Output("Out"); - auto* index = context.Output("Index"); + auto data_type = static_cast( + context.Attr("dtype")); + if (data_type == framework::proto::VarType::INT32) { + PADDLE_ENFORCE_LE( + x->numel(), INT_MAX, + platform::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. 
Please set `dtype` to " + "int64.", + x->numel())); + } + if (!context.Attr("is_sorted")) { + auto* index = context.Output("Index"); - framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x)); + framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x)); + return; + } + + std::vector axis_vec = context.Attr>("axis"); + bool return_index = context.Attr("return_index"); + bool return_inverse = context.Attr("return_inverse"); + bool return_counts = context.Attr("return_counts"); + + if (axis_vec.empty()) { + framework::VisitDataTypeSmall( + data_type, + UniqueFlattendTensorFunctor( + context, *x, out, return_index, return_inverse, return_counts)); + } else { + int axis = axis_vec[0]; + framework::VisitDataTypeSmall( + data_type, UniqueDimFunctor( + context, *x, out, axis, return_index, return_inverse, + return_counts)); + } } }; diff --git a/paddle/fluid/operators/xpu/mul_xpu_op.cc b/paddle/fluid/operators/xpu/mul_xpu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..79aae71c3045f938f4b8f0d3e05ce7cf358c41ea --- /dev/null +++ b/paddle/fluid/operators/xpu/mul_xpu_op.cc @@ -0,0 +1,183 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include +#include +#include +#include +#include "paddle/fluid/operators/mul_op.h" + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +template +class MulXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const Tensor* y = context.Input("Y"); + Tensor* z = context.Output("Out"); + const Tensor x_matrix = + x->dims().size() > 2 + ? framework::ReshapeToMatrix( + *x, context.template Attr("x_num_col_dims")) + : *x; + const Tensor y_matrix = + y->dims().size() > 2 + ? 
framework::ReshapeToMatrix( + *y, context.template Attr("y_num_col_dims")) + : *y; + z->mutable_data(context.GetPlace()); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + bool trans_a = false; + bool trans_b = false; + int m = x_matrix.dims()[0]; + int k = x_matrix.dims()[1]; + int k1 = y_matrix.dims()[0]; + int n = y_matrix.dims()[1]; + PADDLE_ENFORCE_EQ( + k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + const T* data_a = x_matrix.data(); + const T* data_b = y_matrix.data(); + T* data_c = z->data(); + auto& dev_ctx = context.template device_context(); + int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, + alpha, data_a, data_b, beta, data_c); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } + } +}; + +template +class MulGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + auto y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + auto* dout = ctx.Input(framework::GradVarName("Out")); + Tensor dout_mat; + dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + if (dx != nullptr) { + dx->set_lod(x->lod()); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + } + auto& dev_ctx = ctx.template device_context(); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + Tensor dx_matrix = dx->dims().size() > 2 + ? framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + // blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); + bool trans_a = false; + bool trans_b = true; + int m = dout_mat.dims()[0]; + int k = dout_mat.dims()[1]; + int n = y_matrix.dims()[0]; + int k1 = y_matrix.dims()[1]; + PADDLE_ENFORCE_EQ( + k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); + int lda = (!trans_a) ? k : m; + int ldb = (!trans_b) ? n : k; + int ldc = n; + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + const T* data_a = dout->data(); + const T* data_b = y_matrix.data(); + T* data_c = dx_matrix.data(); + int ret = + xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, + data_a, lda, data_b, ldb, beta, data_c, ldc); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + ret)); + } + + if (dy) { + dy->mutable_data(ctx.GetPlace()); + Tensor dy_matrix = dy->dims().size() > 2 + ? framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; + // dy = x' * dout. 
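+      // (Gemm mapping, for reference: A = x_matrix stored as M x K and used with
+      // trans_a = true, B = dout stored as M x N, so C = dy gets K rows and N
+      // columns; the row-major leading dimensions lda/ldb/ldc are K, N and N.)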
dy K x N, dout : M x N, x : M x K + // blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); + bool trans_a = true; + bool trans_b = false; + int k = x_matrix.dims()[0]; + int m = x_matrix.dims()[1]; + int k1 = dout_mat.dims()[0]; + int n = dout_mat.dims()[1]; + PADDLE_ENFORCE_EQ( + k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); + int lda = (!trans_a) ? k : m; + int ldb = (!trans_b) ? n : k; + int ldc = n; + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + const T* data_a = x_matrix.data(); + const T* data_b = dout->data(); + T* data_c = dy_matrix.data(); + int ret = + xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, + data_a, lda, data_b, ldb, beta, data_c, ldc); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + ret)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + mul, ops::MulXPUKernel); +REGISTER_OP_XPU_KERNEL( + mul_grad, ops::MulGradXPUKernel) +#endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 6a28b975f8e3fc7c7bd56f3a88464cbe4b4c9a72..652b4dd47daa8aecdcae43e8c910d7dd61bbb64d 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -4,6 +4,12 @@ if(WITH_GPU) proto_library(cuda_error_proto SRCS cuda_error.proto) endif(WITH_GPU) +if(WITH_XPU) + set(XPU_CTX_DEPS xpulib) +ELSE() + set(XPU_CTX_DEPS) +endif(WITH_XPU) + if (WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -50,6 +56,10 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cu cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) +if(WITH_XPU) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce) +endif() + add_subdirectory(dynload) add_subdirectory(stream) @@ -78,13 +88,17 @@ ELSE() set(STREAM_CALLBACK_DEPS) ENDIF() +if(WITH_GLOO) + cc_library(gloo_context SRCS gloo_context.cc DEPS framework_proto gloo_wrapper enforce) +endif() + cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - ${dgc_deps} dlpack cudnn_workspace_helper) + ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 38b0894c3f71dc150a9ed737b0ac17b22baffb8a..29982c13c8ca88bc8b4a168f92e4116a283a97e8 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -61,7 +61,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { if (it == device_contexts_.end()) { PADDLE_THROW(platform::errors::Unimplemented( "Place %s is not supported. 
Please check that your paddle compiles " - "with WITH_GPU option or check that your train process hold the " + "with WITH_GPU or WITH_XPU option or check that your train process " + "hold the " "correct gpu_id if you use Executor.", place)); } @@ -115,6 +116,14 @@ DeviceContextPool::DeviceContextPool( PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported. Please re-compile with WITH_GPU " "option.")); +#endif + } else if (platform::is_xpu_place(p)) { +#ifdef PADDLE_WITH_XPU + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW( + platform::errors::Unimplemented("XPUPlace is not supported. Please " + "re-compile with WITH_XPU option.")); #endif } } @@ -134,6 +143,49 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { Place CPUDeviceContext::GetPlace() const { return place_; } +#ifdef PADDLE_WITH_XPU +XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); } + +XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); } + +XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_set_device(place.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + context_ = xpu::create_context(); + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); +} + +void XPUDeviceContext::Wait() const { + int ret = xpu_set_device(place_.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + xpu_wait(); +} + +Place XPUDeviceContext::GetPlace() const { return place_; } + +xpu::Context* XPUDeviceContext::x_context() const { return context_; } +#endif + #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice : public Eigen::StreamInterface { @@ -412,9 +464,21 @@ MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) { return cur_paddle_data_layout; } -void MKLDNNDeviceContext::ResetBlobMap() const { - VLOG(3) << "Clearing DNNL cache."; - p_blobmap_->clear(); +void MKLDNNDeviceContext::ResetBlobMap() { + std::lock_guard lock(*p_mutex_); + if (!block_next_cache_clearing_) { + VLOG(3) << "Clearing DNNL cache."; + p_blobmap_->clear(); + } else { + VLOG(3) << "Prevented Clearing DNNL cache."; + block_next_cache_clearing_ = false; + } +} + +void MKLDNNDeviceContext::BlockNextCacheClearing() { + std::lock_guard lock(*p_mutex_); + VLOG(3) << "Next DNNL cache clearing has been blocked."; + block_next_cache_clearing_ = true; } size_t MKLDNNDeviceContext::GetShapeBlobSize() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7511edb9ccf2c6ca1d5aea2964799b8be08064b6..8bfdfc8a1c6033a79c197e1cd425197f77079bda 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -43,6 +43,10 @@ limitations under the License. 
*/ #endif #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#endif + namespace paddle { namespace platform { @@ -76,6 +80,35 @@ struct DefaultDeviceContextType { using TYPE = CPUDeviceContext; }; +#ifdef PADDLE_WITH_XPU +class XPUDeviceContext : public DeviceContext { + public: + XPUDeviceContext(); + explicit XPUDeviceContext(XPUPlace place); + virtual ~XPUDeviceContext(); + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + Place GetPlace() const override; + xpu::Context* x_context() const; + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + private: + XPUPlace place_; + xpu::Context* context_; + + // Need to be the same with other DeviceContext, + // Eventhough eigen_device_ is not used in XPU + std::unique_ptr eigen_device_; + DISABLE_COPY_AND_ASSIGN(XPUDeviceContext); +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = XPUDeviceContext; +}; +#endif + #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; @@ -487,7 +520,10 @@ class MKLDNNDeviceContext : public CPUDeviceContext { const mkldnn::engine& GetEngine() const { return engine_; } // Remove all entries from the blob map - void ResetBlobMap() const; + void ResetBlobMap(); + + // Prevent next ResetBlobMap() + void BlockNextCacheClearing(); // Get the ShapeBlob size in cur_mkldnn_session_id. size_t GetShapeBlobSize() const; @@ -506,6 +542,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { mkldnn::engine engine_; std::shared_ptr p_blobmap_; std::shared_ptr p_mutex_; + bool block_next_cache_clearing_ = false; }; #endif diff --git a/paddle/fluid/platform/device_context_xpu_test.cc b/paddle/fluid/platform/device_context_xpu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3de2e3957a990a254ffb762f996876a122a865bc --- /dev/null +++ b/paddle/fluid/platform/device_context_xpu_test.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/platform/device_context.h" + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::XPUDeviceContext; + using paddle::platform::XPUPlace; + + int count = paddle::platform::GetXPUDeviceCount(); + for (int i = 0; i < count; i++) { + XPUDeviceContext* device_context = new XPUDeviceContext(XPUPlace(i)); + xpu::Context* ctx = device_context->x_context(); + ASSERT_NE(nullptr, ctx); + delete device_context; + } +} + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::XPUDeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::XPUPlace; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); + + std::vector xpu_places; + int count = paddle::platform::GetXPUDeviceCount(); + for (int i = 0; i < count; ++i) { + auto dev_ctx = pool.Get(XPUPlace(i)); + ASSERT_NE(dev_ctx, nullptr); + } +} diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 8667375c6f2726f1099c6e57c6e793252b01d454..af8798a4b7cf5a8832ce9345cad45ce3096484e4 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -508,3 +508,16 @@ DEFINE_int32( "summary will be shown." "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and " "error message summary will be shown."); + +/** + * Debug related FLAG + * Name: sort_sum_gradient + * Since Version: 2.0.0 + * Value Range: bool, default=false + * Example: + * Note: If True, gradients are summed by the reverse order of + * the forward execution sequence. + */ +DEFINE_bool(sort_sum_gradient, false, + "Sum gradients by the reverse order of " + "the forward execution sequence."); diff --git a/paddle/fluid/platform/gloo_context.cc b/paddle/fluid/platform/gloo_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..32e7299d319c91891c7c05dd1e8cfa85e99a0422 --- /dev/null +++ b/paddle/fluid/platform/gloo_context.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/gloo_context.h" + +namespace paddle { +namespace platform { +#if defined(PADDLE_WITH_GLOO) +void GlooParallelContext::Init() { + auto gloo_ptr = paddle::framework::GlooWrapper::GetInstance(); + gloo_ptr->SetRank(strategy_.rank); + gloo_ptr->SetSize(strategy_.rank_num); + gloo_ptr->SetPrefix(strategy_.prefix); + gloo_ptr->SetIface(strategy_.iface); + gloo_ptr->SetTimeoutSeconds(strategy_.init_seconds, strategy_.run_seconds); + gloo_ptr->SetHdfsStore(strategy_.path, strategy_.fs_name, strategy_.fs_ugi); + gloo_ptr->Init(); +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gloo_context.h b/paddle/fluid/platform/gloo_context.h new file mode 100644 index 0000000000000000000000000000000000000000..a7dcf288a22c71c29fc22ec5e131249662214b8d --- /dev/null +++ b/paddle/fluid/platform/gloo_context.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" + +namespace paddle { +namespace platform { + +#if defined(PADDLE_WITH_GLOO) +struct GlooParallelStrategy { + int rank{0}; + int rank_num{1}; + std::string iface; + std::string prefix; + int init_seconds{9999999}; + int run_seconds{9999999}; + std::string path; + std::string fs_name; + std::string fs_ugi; +}; + +class GlooParallelContext { + public: + explicit GlooParallelContext(const GlooParallelStrategy& strategy) + : strategy_(strategy) {} + + virtual ~GlooParallelContext() {} + + virtual void Init(); + + protected: + GlooParallelStrategy strategy_; +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 261f6e807a22d328a20156bed8ee9974637898c3..2e708e44fd0e49e1c33e048084d15e13c6e4d57e 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -33,6 +33,11 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu_info.h" +#endif + DECLARE_int32(paddle_num_threads); DEFINE_int32(multiple_of_cupti_buffer_size, 1, "Multiple of the CUPTI device buffer size. If the timestamps have " @@ -151,6 +156,14 @@ void InitDevices(bool init_p2p) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_XPU + try { + // use user specified XPUs in single-node multi-process mode. 
+ devices = platform::GetXPUSelectedDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; + } #endif InitDevices(init_p2p, devices); } @@ -165,7 +178,13 @@ void InitDevices(bool init_p2p, const std::vector devices) { LOG(WARNING) << "Invalid devices id."; continue; } + +#ifdef PADDLE_WITH_CUDA places.emplace_back(platform::CUDAPlace(devices[i])); +#endif +#ifdef PADDLE_WITH_XPU + places.emplace_back(platform::XPUPlace(devices[i])); +#endif } if (init_p2p) { InitP2P(devices); diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 6392c4f4c42af9030e9dd0b3373df60938a4676f..f14fbdd74f95bfbed53ff787af861ce4656159c0 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -20,7 +20,7 @@ TEST(InitDevices, CPU) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) InitDevices(true); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 1U); @@ -39,6 +39,18 @@ TEST(InitDevices, CUDA) { #endif } +TEST(InitDevices, XPU) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifdef PADDLE_WITH_XPU + int count = paddle::platform::GetXPUDeviceCount(); + InitDevices(true); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U + static_cast(count)); +#endif +} + #ifndef _WIN32 TEST(SignalHandle, SignalHandle) { std::string msg = "Signal raises"; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index c74c47b7d84820f089d4e657f8bddccc5de8d727..3782eb684f21f8c09e9dac124082ae596fe5d1bc 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -129,6 +129,16 @@ inline void ClearMKLDNNCache(const platform::Place& place) { } } +inline void DontClearMKLDNNCache(const platform::Place& place) { + // Clear mkl-dnn cache, + if (platform::is_cpu_place(place)) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::MKLDNNDeviceContext* dev_ctx = + (platform::MKLDNNDeviceContext*)pool.Get(place); + dev_ctx->BlockNextCacheClearing(); + } +} + template mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::data_type::undef; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 5d7143f56b3f394bb1a99c1b3802b7c20138dfb7..d1c5480c0f5438826e6eb5cc0de211ee1af74cf7 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -82,17 +82,21 @@ class MKLDNNHandlerT { fwd_pd_->src_desc(), to_void_cast(input_data), "@src_mem_p"); } + template std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T* ptr = output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr, "@dst_mem_p"); } + template std::shared_ptr AcquireDstMemory( const framework::Tensor* output) { - const T* output_data = output->data(); - return this->AcquireMemoryFromPrimitive( - bwd_pd_->dst_desc(), to_void_cast(output_data), "@bwd-dst_mem_p"); + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data), + "@bwd-dst_mem_p"); } std::shared_ptr AcquireDiffDstMemory( diff --git 
a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 195acc1b6d15a91369d48164179cd6e0b5cfac8d..b80d2fd1632cd82c231fae724fc4d754b8fed0fc 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -32,6 +32,7 @@ class PlacePrinter : public boost::static_visitor<> { void operator()(const CUDAPlace &p) { os_ << "CUDAPlace(" << p.device << ")"; } + void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -44,6 +45,10 @@ bool is_gpu_place(const Place &p) { return boost::apply_visitor(IsCUDAPlace(), p); } +bool is_xpu_place(const Place &p) { + return boost::apply_visitor(IsXPUPlace(), p); +} + bool is_cpu_place(const Place &p) { return boost::apply_visitor(IsCPUPlace(), p); } @@ -60,6 +65,8 @@ bool is_same_place(const Place &p1, const Place &p2) { if (places_are_same_class(p1, p2)) { if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) { return true; + } else if (is_xpu_place(p1)) { + return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); } else { return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index eeda10a633b655dee0da9197888738cd94b50809..f95f6954a32e771e7413a766afcfea8b85ff1f7e 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -58,31 +58,58 @@ struct CUDAPinnedPlace { inline bool operator<(const CUDAPinnedPlace &) const { return false; } }; +// Place for Baidu Kunlun Accelerator +struct XPUPlace { + XPUPlace() : XPUPlace(0) {} + explicit XPUPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const XPUPlace &o) const { return device == o.device; } + inline bool operator!=(const XPUPlace &o) const { return !(*this == o); } + inline bool operator<(const XPUPlace &o) const { return device < o.device; } + + int device; +}; + struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } bool operator()(const CUDAPlace &gpu) const { return true; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; struct IsCPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &cpu) const { return true; } + bool operator()(const XPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; struct IsCUDAPinnedPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } }; -class Place : public boost::variant { +struct IsXPUPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &xpu) const { return true; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } +}; + +class Place + : public boost::variant { private: - using PlaceBase = boost::variant; + using PlaceBase = + boost::variant; public: Place() = default; Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT + Place(const XPUPlace &xpu_place) : 
PlaceBase(xpu_place) {} // NOLINT Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT : PlaceBase(cuda_pinned_place) {} @@ -98,6 +125,7 @@ class Place : public boost::variant { using PlaceList = std::vector; bool is_gpu_place(const Place &); +bool is_xpu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); @@ -115,6 +143,16 @@ struct PlaceVisitorWrapper return visitor_(cpu); } + typename Visitor::result_type operator()(const XPUPlace &xpu) const { +#ifdef PADDLE_WITH_XPU + return visitor_(xpu); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with XPU. Cannot visit xpu device")); + return typename Visitor::result_type(); +#endif + } + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #ifdef PADDLE_WITH_CUDA return visitor_(cuda); diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc index e4c1d3def90f191194b46bb9ea27dd27d69dcb8b..13f28c73f4504aea85d6155a3daa8f8f01b26385 100644 --- a/paddle/fluid/platform/place_test.cc +++ b/paddle/fluid/platform/place_test.cc @@ -18,19 +18,32 @@ TEST(Place, Equality) { paddle::platform::CPUPlace cpu; paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); + paddle::platform::XPUPlace x0(0), x1(1), xx0(0); EXPECT_EQ(cpu, cpu); EXPECT_EQ(g0, g0); EXPECT_EQ(g1, g1); EXPECT_EQ(g0, gg0); + EXPECT_EQ(x0, x0); + EXPECT_EQ(x1, x1); + EXPECT_EQ(x0, xx0); EXPECT_NE(g0, g1); + EXPECT_NE(x0, x1); EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); + EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0)); EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu)); + EXPECT_FALSE(paddle::platform::places_are_same_class(x0, cpu)); + EXPECT_FALSE(paddle::platform::places_are_same_class(g0, x0)); } TEST(Place, Print) { + { + std::stringstream ss; + ss << paddle::platform::XPUPlace(1); + EXPECT_EQ("XPUPlace(1)", ss.str()); + } { std::stringstream ss; ss << paddle::platform::CUDAPlace(1); diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/platform/xpu_header.h similarity index 53% rename from paddle/fluid/imperative/backward_strategy.h rename to paddle/fluid/platform/xpu_header.h index 0f04d6db8e63d5d069745ed1895df774e69d60d0..d8c5f85f9cfe4b9d6ac07069fff89d37c695af5b 100644 --- a/paddle/fluid/imperative/backward_strategy.h +++ b/paddle/fluid/platform/xpu_header.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,22 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -// -// Created by Jiabin on 2019-04-25. 
-// #pragma once -namespace paddle { -namespace imperative { -namespace detail { - -struct BackwardStrategy { - /* DyGraph now support two kinds of backward strategy, one is sorted sum - * gradient, another is sum gradient once they are created */ - // TODO(jiabin): add more Strategy when we support - bool sorted_sum_gradient_{false}; -}; +#ifdef PADDLE_WITH_XPU +#include "xpu/api.h" +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" -} // namespace detail -} // namespace imperative -} // namespace paddle +namespace xpu = baidu::xpu::api; +#endif diff --git a/paddle/fluid/platform/xpu_info.cc b/paddle/fluid/platform/xpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..f88248fda7e65e1b96448c0576880a18a9d8a4a9 --- /dev/null +++ b/paddle/fluid/platform/xpu_info.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/xpu_info.h" + +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/string/split.h" + +DEFINE_string(selected_xpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (XPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between XPU devices, use XPU_VISIBLE_DEVICES can only use" + "share-memory only."); + +namespace paddle { +namespace platform { + +static int GetXPUDeviceCountImpl() { + const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); + if (xpu_visible_devices != nullptr) { + std::string xpu_visible_devices_str(xpu_visible_devices); + if (std::all_of(xpu_visible_devices_str.begin(), + xpu_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected."; + return 0; + } + } + + int count = 0; + int ret = xpu_device_count(&count); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + return count; +} + +int GetXPUDeviceCount() { + static auto dev_cnt = GetXPUDeviceCountImpl(); + return dev_cnt; +} + +int GetXPUCurrentDeviceId() { + int dev_id; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + return dev_id; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices() { + // use user specified XPUs in single-node multi-process mode. 
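+  // FLAGS_selected_xpus is a comma separated id list, e.g. "0,2" -> {0, 2};
+  // when it is empty, every device reported by GetXPUDeviceCount() is used.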
+ std::vector devices; + if (!FLAGS_selected_xpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetXPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void SetXPUDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, GetXPUDeviceCount(), + platform::errors::InvalidArgument("id must less than XPU count")); + int ret = xpu_set_device(id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/xpu_info.h b/paddle/fluid/platform/xpu_info.h new file mode 100644 index 0000000000000000000000000000000000000000..efaba13453e7472ed09ff66c70bdaf19eb89549d --- /dev/null +++ b/paddle/fluid/platform/xpu_info.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_XPU +#include + +namespace paddle { +namespace platform { + +//! Get the total number of XPU devices in system. +int GetXPUDeviceCount(); + +//! Get the current XPU device id in system. +int GetXPUCurrentDeviceId(); + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices(); + +//! Set the XPU device id for next execution. 
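+//! The id is checked against GetXPUDeviceCount() before xpu_set_device is called.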
+void SetXPUDeviceId(int device_id); + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index ef19fcc5475558eccf79867a03df0b8beba8ad08..d733cf26ed209bcb86eaf2d366e45cfa0e7f9a90 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -40,6 +40,11 @@ set(PYBIND_SRCS inference_api.cc generator_py.cc) +if(WITH_GLOO) + set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context) + set(PYBIND_SRCS ${PYBIND_SRCS} gloo_context_py.cc) +endif(WITH_GLOO) + if (WITH_CRYPTO) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_crypto) set(PYBIND_SRCS ${PYBIND_SRCS} crypto.cc) @@ -72,8 +77,23 @@ if(WITH_PYTHON) set(tmp_impl_file ${impl_file}.tmp) if(WIN32) + file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat "" + "set build_times=1\n" + ":retry\n" + "ECHO op_function_generator run %build_times% time\n" + "${CMAKE_BINARY_DIR}/paddle/fluid/pybind/${CMAKE_BUILD_TYPE}/op_function_generator ${impl_file}\n" + "if %ERRORLEVEL% NEQ 0 (\n" + " set /a build_times=%build_times%+1\n" + " if %build_times% GTR 100 (\n" + " exit /b 1\n" + " ) else (\n" + " goto :retry\n" + " )\n" + ")\n" + "exit /b 0") + add_custom_command(TARGET op_function_generator POST_BUILD - COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/${CMAKE_BUILD_TYPE}/op_function_generator ${impl_file} + COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat ) if(${CBLAS_PROVIDER} STREQUAL MKLML) diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 3bccd5fb2dd92298323381c09467937abd87a53c..90b7f501052530a306ba22ea6a244f0ef8fad563 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -29,23 +29,36 @@ namespace py = pybind11; namespace paddle { namespace pybind { -void BindGenerator(py::module* m) { - py::class_(*m, "GeneratorState", ""); - py::class_(*m, "mt19937_64", ""); +void BindGenerator(py::module* m_ptr) { + auto& m = *m_ptr; + py::class_>(m, "GeneratorState") + .def("current_seed", + [](std::shared_ptr& self) { + return self->current_seed; + }); + py::class_(m, "mt19937_64", ""); py::class_>( - *m, "Generator") - .def(py::init([]() { return framework::Generator::GetInstanceX(); }), - py::return_value_policy::reference) - .def("get_state", &framework::Generator::GetState, - py::return_value_policy::move) + m, "Generator") + .def("__init__", + [](framework::Generator& self) { + new (&self) framework::Generator(); + }) + .def("get_state", &framework::Generator::GetState) .def("set_state", &framework::Generator::SetState) - .def("manual_seed", &framework::Generator::SetCurrentSeed) + .def("manual_seed", + [](std::shared_ptr& self, uint64_t seed) { + self->SetCurrentSeed(seed); + return self; + }) .def("seed", &framework::Generator::Seed) .def("initial_seed", &framework::Generator::GetCurrentSeed) .def("random", &framework::Generator::Random64) - .def("get_cpu_engine", &framework::Generator::GetCPUEngine, - py::return_value_policy::move) - .def("set_cpu_engine", &framework::Generator::SetCPUEngine); + // .def("get_cpu_engine", &framework::Generator::GetCPUEngine) + // .def("set_cpu_engine", &framework::Generator::SetCPUEngine) + .def_property("_is_init_py", &framework::Generator::GetIsInitPy, + &framework::Generator::SetIsInitPy); + m.def("default_cpu_generator", &framework::DefaultCPUGenerator); } // end Generator } // end namespace pybind -} // end namespace paddle +} // namespace paddle diff --git 
a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index deca9625e63d05625c407a1282b396398bb78ccc..f1084018d9c79e46c33098dafdb48dc395dac652 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -38,6 +38,7 @@ DECLARE_bool(enable_rpc_profiler); DECLARE_int32(multiple_of_cupti_buffer_size); DECLARE_bool(reader_queue_speed_test_mode); DECLARE_int32(call_stack_level); +DECLARE_bool(sort_sum_gradient); // device management DECLARE_int32(paddle_num_threads); // executor @@ -340,7 +341,7 @@ static void RegisterGlobalVarGetterSetter() { REGISTER_PUBLIC_GLOBAL_VAR( FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph, FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf, - FLAGS_call_stack_level, FLAGS_cpu_deterministic, + FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use, diff --git a/paddle/fluid/pybind/gloo_context_py.cc b/paddle/fluid/pybind/gloo_context_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a9c77b0c3a06ca9a17f33643d88ddf932c32544 --- /dev/null +++ b/paddle/fluid/pybind/gloo_context_py.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/pybind/gloo_context_py.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/platform/gloo_context.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +// Bind Methods +void BindGlooContext(py::module *m) { +// define parallel context for gloo +#if defined(PADDLE_WITH_GLOO) + py::class_ gloo_parallel_strategy( + *m, "GlooParallelStrategy", ""); + gloo_parallel_strategy.def(py::init()) + .def_property("rank_num", + [](const platform::GlooParallelStrategy &self) { + return self.rank_num; + }, + [](platform::GlooParallelStrategy &self, int nranks) { + self.rank_num = nranks; + }) + .def_property( + "rank", + [](const platform::GlooParallelStrategy &self) { return self.rank; }, + [](platform::GlooParallelStrategy &self, int rank) { + self.rank = rank; + }) + .def_property( + "iface", + [](const platform::GlooParallelStrategy &self) { return self.iface; }, + [](platform::GlooParallelStrategy &self, const std::string &iface) { + self.iface = iface; + }) + .def_property("prefix", + [](const platform::GlooParallelStrategy &self) { + return self.prefix; + }, + [](platform::GlooParallelStrategy &self, + const std::string &prefix) { self.prefix = prefix; }) + .def_property("init_seconds", + [](const platform::GlooParallelStrategy &self) { + return self.init_seconds; + }, + [](platform::GlooParallelStrategy &self, int init_seconds) { + self.init_seconds = init_seconds; + }) + .def_property("run_seconds", + [](const platform::GlooParallelStrategy &self) { + return self.run_seconds; + }, + [](platform::GlooParallelStrategy &self, int run_seconds) { + self.run_seconds = run_seconds; + }) + .def_property( + "path", + [](const platform::GlooParallelStrategy &self) { return self.path; }, + [](platform::GlooParallelStrategy &self, const std::string &path) { + self.path = path; + }) + .def_property("fs_name", + [](const platform::GlooParallelStrategy &self) { + return self.fs_name; + }, + [](platform::GlooParallelStrategy &self, + const std::string &fs_name) { self.fs_name = fs_name; }) + .def_property("fs_ugi", + [](const platform::GlooParallelStrategy &self) { + return self.fs_ugi; + }, + [](platform::GlooParallelStrategy &self, + const std::string &fs_ugi) { self.fs_ugi = fs_ugi; }); + + py::class_ gloo_ctx(*m, "GlooParallelContext"); + gloo_ctx.def(py::init()) + .def("init", [](platform::GlooParallelContext &self) { self.Init(); }); +#endif +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/gloo_context_py.h b/paddle/fluid/pybind/gloo_context_py.h new file mode 100644 index 0000000000000000000000000000000000000000..89bd183097b7541c33a797f27178bafb934bcd52 --- /dev/null +++ b/paddle/fluid/pybind/gloo_context_py.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +void BindGlooContext(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be55201595e34ef7fa520908b56208b8a6ff1895..489dd198876204486fc94518fbef0c806d0543d4 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -30,7 +30,6 @@ limitations under the License. */ #include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/imperative/amp_auto_cast.h" -#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/layer.h" @@ -66,11 +65,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace")); + "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); } } @@ -92,6 +93,9 @@ static void InitTensorForVarBase(imperative::VarBase *self, if (platform::is_cpu_place(place)) { SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy); + } else if (platform::is_xpu_place(place)) { + SetTensorFromPyArray( + tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy); } else if (platform::is_gpu_place(place)) { SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy); @@ -101,7 +105,7 @@ static void InitTensorForVarBase(imperative::VarBase *self, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace")); + "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); } if (stop_gradient != -1) { self->SetOverridedStopGradient(stop_gradient); @@ -502,50 +506,6 @@ void BindImperative(py::module *m_ptr) { []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); }); #endif - py::class_ backward_strategy( - m, "BackwardStrategy", R"DOC( - - BackwardStrategy is a descriptor of how to run the backward process. - - **Note**: - **This API is only available in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode** - - Attribute: - **sort_sum_gradient**: - - If framework will sum the gradient by the reverse order of trace. eg. x_var ( :ref:`api_guide_Variable` ) will be the input of multiple OP such as :ref:`api_fluid_layers_scale` , this attr will decide if framework will sum gradient of `x_var` by the reverse order. - - By Default: False - - Examples: - .. 
code-block:: python - - import numpy as np - import paddle.fluid as fluid - - x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - x_var = fluid.dygraph.to_variable(x) - sums_inputs = [] - # x_var will be multi-scales' input here - for _ in range(10): - sums_inputs.append(fluid.layers.scale(x_var)) - ret2 = fluid.layers.sums(sums_inputs) - loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) - )DOC"); - backward_strategy.def(py::init()) - .def_property("sort_sum_gradient", - [](const imperative::detail::BackwardStrategy &self) { - return self.sorted_sum_gradient_; - }, - [](imperative::detail::BackwardStrategy &self, - bool sorted_sum_gradient) { - self.sorted_sum_gradient_ = sorted_sum_gradient; - }); - m.def("start_imperative_gperf_profiler", []() { imperative::StartProfile(); }); @@ -588,6 +548,10 @@ void BindImperative(py::module *m_ptr) { py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) + .def("__init__", &InitVarBaseFromNumpyWithArg, + py::arg("value"), py::arg("place"), py::arg("persistable") = false, + py::arg("zero_copy") = false, py::arg("name") = "", + py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArg, py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", @@ -736,21 +700,18 @@ void BindImperative(py::module *m_ptr) { inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) loss2.clear_gradient() print("After clear {}".format(loss2.gradient())) )DOC") .def("_run_backward", - [](imperative::VarBase &self, - const imperative::detail::BackwardStrategy &bckst, - const imperative::Tracer &tracer, bool retain_graph) { + [](imperative::VarBase &self, const imperative::Tracer &tracer, + bool retain_graph) { // TODO(jiabin): when we impl more backward execution we can // select them auto *engine = tracer.GetEngine(); - engine->Init(&self, bckst, retain_graph); + engine->Init(&self, retain_graph); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; @@ -823,6 +784,10 @@ void BindImperative(py::module *m_ptr) { const platform::CUDAPinnedPlace &place, bool blocking) { return self.NewVarBase(place, blocking); }, py::return_value_policy::copy) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::XPUPlace &place, + bool blocking) { return self.NewVarBase(place, blocking); }, + py::return_value_policy::copy) .def("_copy_to", [](const imperative::VarBase &self, const platform::CUDAPlace &place, bool blocking) { return self.NewVarBase(place, blocking); }, @@ -890,6 +855,9 @@ void BindImperative(py::module *m_ptr) { if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -898,7 +866,8 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Incompatible Place Type: supports CUDAPlace, CPUPlace, " + "Incompatible Place Type: supports XPUPlace, CUDAPlace, " + 
"CPUPlace, " "and CUDAPinnedPlace, " "but got Unknown Type!")); } @@ -928,6 +897,19 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetAllowOps()), *(imperative::AmpOperators::Instance().GetBlockOps())); }) + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::XPUPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, @@ -994,13 +976,11 @@ void BindImperative(py::module *m_ptr) { &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, - const imperative::detail::BackwardStrategy &strategy, - bool create_graph, bool retain_graph, bool allow_unused, - bool only_inputs) { + const platform::Place &place, bool create_graph, bool retain_graph, + bool allow_unused, bool only_inputs) { imperative::PartialGradEngine engine( input_targets, output_targets, output_grads, no_grad_vars, place, - strategy, create_graph, retain_graph, allow_unused, only_inputs); + create_graph, retain_graph, allow_unused, only_inputs); engine.Execute(); return engine.GetResult(); }, diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 696da67c9c98fe16b28ceb05d5c07049104fd43b..cf0dac022f74e47261fc28d02665bcde49dc8b39 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -448,6 +448,7 @@ void BindAnalysisConfig(py::module *m) { &AnalysisConfig::cpu_math_library_num_threads) .def("to_native_config", &AnalysisConfig::ToNativeConfig) .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16) #ifdef PADDLE_WITH_MKLDNN .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config, py::return_value_policy::reference) @@ -565,6 +566,7 @@ void BindPaddlePassBuilder(py::module *m) { .def("enable_cudnn", &PassStrategy::EnableCUDNN) .def("enable_mkldnn", &PassStrategy::EnableMKLDNN) .def("enable_mkldnn_quantizer", &PassStrategy::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16) .def("use_gpu", &PassStrategy::use_gpu); py::class_(*m, "CpuPassStrategy") @@ -572,14 +574,16 @@ void BindPaddlePassBuilder(py::module *m) { .def(py::init()) .def("enable_cudnn", &CpuPassStrategy::EnableCUDNN) .def("enable_mkldnn", &CpuPassStrategy::EnableMKLDNN) - .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer); + .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16); py::class_(*m, "GpuPassStrategy") .def(py::init<>()) .def(py::init()) .def("enable_cudnn", &GpuPassStrategy::EnableCUDNN) .def("enable_mkldnn", &GpuPassStrategy::EnableMKLDNN) - .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer); + .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16); } } // namespace } // namespace pybind diff --git a/paddle/fluid/pybind/op_function_generator.cc 
b/paddle/fluid/pybind/op_function_generator.cc index a770275bf08c05cbc2c5474f456df8de289e9689..256faf04ea6de5835f22113537caac49ca1dbab4 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -41,6 +41,8 @@ std::map> op_ins_map = { {"fake_quantize_dequantize_moving_average_abs_max", {"X", "InScale", "InAccum", "InState"}}, {"nll_loss", {"X", "Label", "Weight"}}, + {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, + {"gather", {"X", "Index", "Axis"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -60,6 +62,7 @@ std::map> op_outs_map = { {"sync_batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, + {"unique", {"Out", "Index", "Indices", "Counts"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -83,6 +86,19 @@ std::map> op_passing_outs_map = { {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, {"matmul", {"Out"}}, + {"c_broadcast", {"Out"}}, + {"c_allreduce_sum", {"Out"}}, + {"c_allreduce_max", {"Out"}}, + {"c_allreduce_min", {"Out"}}, + {"c_allreduce_prod", {"Out"}}, + {"c_reduce_sum", {"Out"}}, + {"c_reduce_max", {"Out"}}, + {"c_reduce_min", {"Out"}}, + {"c_reduce_prod", {"Out"}}, + {"c_reduce", {"Out"}}, + {"c_allgather", {"Out"}}, + {"c_scatter", {"Out"}}, + {"barrier", {"Out"}}, {"fake_quantize_dequantize_moving_average_abs_max", {"Out", "OutScale", "OutAccum", "OutState"}}, {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 635a81dff0d371c87628491a58acb33bcc3cffd1..4b8f7c853ceaf2148722a9c65f38e0ec3d9f4df5 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -66,6 +66,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/fleet_wrapper_py.h" #include "paddle/fluid/pybind/generator_py.h" #include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" #include "paddle/fluid/pybind/gloo_wrapper_py.h" #include "paddle/fluid/pybind/heter_wrapper_py.h" #include "paddle/fluid/pybind/imperative.h" @@ -90,6 +91,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_info.h" +#endif + #ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/fluid/pybind/communicator_py.h" #endif @@ -118,6 +123,14 @@ bool IsCompiledWithCUDA() { #endif } +bool IsCompiledWithXPU() { +#ifndef PADDLE_WITH_XPU + return false; +#else + return true; +#endif +} + bool IsCompiledWithMKLDNN() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -471,6 +484,10 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) + .def("_alloc_float", + [](Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_float", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); @@ -483,6 +500,10 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_int", + [](Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_int", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); @@ -500,6 +521,11 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return reinterpret_cast(self.mutable_data(place, type)); }) + .def("_mutable_data", + [](Tensor &self, paddle::platform::XPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast(self.mutable_data(place, type)); + }) .def("_mutable_data", [](Tensor &self, paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { @@ -513,6 +539,8 @@ PYBIND11_MODULE(core_noavx, m) { .def("_clear", &Tensor::clear) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, @@ -522,7 +550,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|CUDAPinnedPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the LoDTensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -1230,6 +1258,18 @@ All parameter, weight, gradient are variables in Paddle. -> paddle::platform::DeviceContext* { return new paddle::platform::CPUDeviceContext(); }) + .def_static("create", + [](paddle::platform::XPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_XPU + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use XPUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with XPU support.")); +#else + return new paddle::platform::XPUDeviceContext(place); +#endif + }) .def_static("create", [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { @@ -1331,6 +1371,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_get_device_id", @@ -1338,6 +1379,60 @@ All parameter, weight, gradient are variables in Paddle. #endif .def("__str__", string::to_string); + py::class_(m, "XPUPlace", R"DOC( + **Note**: + Examples: + .. 
code-block:: python + import paddle.fluid as fluid + xpu_place = fluid.XPUPlace(0) + )DOC") + .def("__init__", + [](platform::XPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_XPU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { + if (platform::GetXPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use XPU because there is no XPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), must inside [0, %d), because XPU " + "number on your machine is %d", + dev_id, platform::GetXPUDeviceCount(), + platform::GetXPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::XPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use XPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use XPU, please try to install XPU version " + "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "If you only have CPU, please change XPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__str__", string::to_string); + py::class_(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1352,6 +1447,7 @@ All parameter, weight, gradient are variables in Paddle. .def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1386,6 +1482,8 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1398,11 +1496,14 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) .def("is_cpu_place", [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_xpu_place", + [](platform::Place &self) { return platform::is_xpu_place(self); }) .def("is_cuda_pinned_place", [](platform::Place &self) { return platform::is_cuda_pinned_place(self); @@ -1411,12 +1512,20 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return BOOST_GET_CONST(platform::CUDAPlace, self).device; }) + .def("xpu_device_id", + [](platform::Place &self) { + return BOOST_GET_CONST(platform::XPUPlace, self).device; + }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; }) + .def("set_place", + [](platform::Place &self, const platform::XPUPlace &xpu_place) { + self = xpu_place; + }) .def("set_place", [](platform::Place &self, const platform::CUDAPlace &gpu_place) { self = gpu_place; @@ -1444,6 +1553,9 @@ All parameter, weight, gradient are variables in Paddle. 
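For orientation, the XPU bindings registered above surface in Python roughly as follows. This is a minimal sketch that assumes a WITH_XPU build with at least one Kunlun device visible; otherwise `XPUPlace(0)` aborts with the error message defined in the constructor above.

```python
# Hedged sketch of the new XPU place and tensor bindings.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core

if core.is_compiled_with_xpu():                  # new is_compiled_with_xpu() query
    place = fluid.XPUPlace(0)                    # new XPUPlace device descriptor
    t = fluid.LoDTensor()
    t.set(np.random.rand(2, 3).astype('float32'), place)  # new XPUPlace overload of Tensor.set
    print(np.array(t))                           # copied back to host memory for printing
```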
.def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::XPUPlace &place) { self.Run(scope, place); }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CUDAPlace &place) { self.Run(scope, place); }) @@ -1544,6 +1656,7 @@ All parameter, weight, gradient are variables in Paddle. [](bool init_p2p) { framework::InitDevices(init_p2p); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); @@ -2499,6 +2612,9 @@ All parameter, weight, gradient are variables in Paddle. #endif #ifdef PADDLE_WITH_NCCL BindNCCLWrapper(&m); +#endif +#ifdef PADDLE_WITH_GLOO + BindGlooContext(&m); #endif BindGraph(&m); BindNode(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c16b22b9fc3aed95d6f55045a90ddad278cbd92e..4377a8c2cef5aab7a200955cd25830d448014817 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -145,8 +146,14 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { T b = static_cast(0); if (platform::is_cpu_place(self.place())) { b = self.data()[offset]; + } else if (platform::is_xpu_place(self.place())) { +#ifdef PADDLE_WITH_XPU + const T *a = self.data(); + auto p = BOOST_GET_CONST(platform::XPUPlace, self.place()); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T)); +#endif + } else if (platform::is_gpu_place(self.place())) { #ifdef PADDLE_WITH_CUDA - } else { const T *a = self.data(); auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), @@ -163,8 +170,14 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { "The offset exceeds the size of tensor.")); if (platform::is_cpu_place(self->place())) { self->mutable_data(self->place())[offset] = elem; + } else if (platform::is_xpu_place(self->place())) { +#ifdef PADDLE_WITH_XPU + auto p = BOOST_GET_CONST(platform::XPUPlace, self->place()); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T)); +#endif + } else if (platform::is_gpu_place(self->place())) { #ifdef PADDLE_WITH_CUDA - } else { auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place()); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), @@ -194,6 +207,16 @@ void SetTensorFromPyArrayT( auto dst = self->mutable_data(place); std::memcpy(dst, array.data(), array.nbytes()); } + } else if (paddle::platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + auto dst = self->mutable_data(place); + xpu_memcpy(dst, array.data(), array.nbytes(), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use XPUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with XPU support.")); +#endif } else { #ifdef PADDLE_WITH_CUDA auto dst = self->mutable_data(place); @@ -354,8 +377,13 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, if 
(platform::is_cpu_place(place)) { output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place), self.type()); -#ifdef PADDLE_WITH_CUDA + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place), + self.type()); +#endif } else { +#ifdef PADDLE_WITH_CUDA if (platform::is_cuda_pinned_place(place)) { output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place), self.type()); @@ -516,6 +544,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, return py::array(); } bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); + bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = tensor.type(); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -534,11 +563,11 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); - if (!is_gpu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor) { if (!need_deep_copy) { - return py::array(py::buffer_info( - const_cast(tensor_buf_ptr), sizeof_dtype, py_dtype_str, - static_cast(tensor.dims().size()), py_dims, py_strides)); + auto base = py::cast(std::move(tensor)); + return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, + const_cast(tensor_buf_ptr), base); } else { py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ( @@ -557,28 +586,54 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, copy_bytes); return py_arr; } - } - + } else if (is_xpu_tensor) { +#ifdef PADDLE_WITH_XPU + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place()); + paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, + tensor_buf_ptr, copy_bytes); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use XPUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (is_gpu_tensor) { #ifdef PADDLE_WITH_CUDA - py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); - PADDLE_ENFORCE_EQ(py_arr.writeable(), true, - platform::errors::InvalidArgument( - "PyArray is not writable, in which case memory leak " - "or double free would occur")); - PADDLE_ENFORCE_EQ(py_arr.owndata(), true, - platform::errors::InvalidArgument( - "PyArray does not own data, in which case memory leak " - "or double free would occur")); - - size_t copy_bytes = sizeof_dtype * numel; - paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr, - copy_bytes, cudaMemcpyDeviceToHost); - return py_arr; + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free 
would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr, + copy_bytes, cudaMemcpyDeviceToHost); + return py_arr; #else - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use CUDAPlace in CPU only version, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CUDAPlace in CPU only version, " + "Please recompile or reinstall Paddle with CUDA support.")); #endif + } + PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); + return py::array(); } } // namespace pybind diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 04870f87c40dd305304579a454cb618bf1446e39..1f88eb2109aa23b6b60104451908b0a70c41c898 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -29,6 +29,8 @@ function(train_test TARGET_NAME) PROPERTIES DEPENDS test_${TARGET_NAME}) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES LABELS "RUN_TYPE=DIST") + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES TIMEOUT 150) endforeach() endfunction(train_test) diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index 39db5a601d3d46c106a574870f02434bd4bd5cd1..d7a86b653bec44c260a845d454c771ec4440993b 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -70,7 +70,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_STYLE_CHECK` | ON | Check the code style when building. | | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | | `RUN_TEST` | OFF | Run unit test immediently after the build. | -| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` | ## Docker Images @@ -155,21 +154,6 @@ docker push kubectl ... ``` -### Reading source code with woboq codebrowser - -For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser). - -- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: - -```bash -./paddle/scripts/paddle_docker_build.sh html -``` - -- You can open the generated HTML files in your Web browser. 
Or, if you want to run a Nginx container to serve them for a wider audience, you can run: - -``` -docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx -``` ## More Options diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index d3f3fc79c4d971142c11645fc59d48f5a633b169..862ab2e8db1fdc353db826204d759d99951d5142 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -28,21 +28,24 @@ rem ------initialize the virtual environment------ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% -set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe -%PYTHON_EXECUTABLE% -m pip install virtualenv -%PYTHON_EXECUTABLE% -m virtualenv paddle_winci -call paddle_winci\Scripts\activate.bat +rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled +rem Now use system python environment temporarily +rem set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe +rem %PYTHON_EXECUTABLE% -m pip install virtualenv +rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci +rem call paddle_winci\Scripts\activate.bat rem ------pre install requirement---------- where python where pip -pip install --upgrade pip -pip install wheel -pip install gym -pip install -U -r %work_dir%\python\requirements.txt +pip install --upgrade pip --user +pip install wheel --user +pip install gym --user +pip install -U -r %work_dir%\python\requirements.txt --user if %ERRORLEVEL% NEQ 0 ( - call paddle_winci\Scripts\deactivate.bat - exit /b %ERRORLEVEL% + call paddle_winci\Scripts\deactivate.bat 2>NUL + echo pip install requirements.txt failed! + exit /b 7 ) rem ------initialize common variable------ @@ -122,14 +125,21 @@ echo ======================================== echo Step 1. Cmake ... echo ======================================== -echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% -cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% +echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ +-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% + +cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ +-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% goto:eof :cmake_error -call paddle_winci\Scripts\deactivate.bat +call paddle_winci\Scripts\deactivate.bat 2>NUL echo Cmake failed, will exit! 
-exit /b 1 +exit /b 7 rem --------------------------------------------------------------------------------------------- :build @@ -140,12 +150,12 @@ call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd6 set build_times=1 :build_tp -echo Build third_party the %build_times% time: +echo Build third_party for %build_times% time: msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 3 ( - exit /b 1 + exit /b 7 ) else ( echo Build third_party failed, will retry! goto :build_tp @@ -155,12 +165,12 @@ echo Build third_party successfully! set build_times=1 :build_paddle -echo Build Paddle the %build_times% time: +echo Build Paddle for %build_times% time: msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 2 ( - exit /b 1 + exit /b 7 ) else ( echo Build Paddle failed, will retry! goto :build_paddle @@ -170,7 +180,7 @@ echo Build Paddle successfully! goto:eof :build_error -call paddle_winci\Scripts\deactivate.bat +call paddle_winci\Scripts\deactivate.bat 2>NUL echo Build Paddle failed, will exit! exit /b 7 @@ -182,16 +192,21 @@ echo ======================================== dir /s /b python\dist\*.whl > whl_file.txt set /p PADDLE_WHL_FILE_WIN=< whl_file.txt -pip install -U %PADDLE_WHL_FILE_WIN% -if %ERRORLEVEL% NEQ 0 exit /b %ERRORLEVEL% +pip uninstall -y paddlepaddle +pip uninstall -y paddlepaddle-gpu +pip install -U %PADDLE_WHL_FILE_WIN% --user +if %ERRORLEVEL% NEQ 0 ( + call paddle_winci\Scripts\deactivate.bat 2>NUL + echo pip install whl package failed! + exit /b 3 +) -echo import paddle.fluid;print(paddle.__version__) > test_whl.py -python test_whl.py +python %work_dir%\paddle\scripts\installation_validate.py goto:eof :test_whl_pacakage_error -call paddle_winci\Scripts\deactivate.bat -echo Pip install whl package failed, will exit! +call paddle_winci\Scripts\deactivate.bat 2>NUL +echo Test import paddle failed, will exit! exit /b 3 rem --------------------------------------------------------------------------------------------- @@ -208,11 +223,11 @@ dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% -ctest.exe --output-on-failure -C Release -j 7 +ctest.exe --output-on-failure -C Release -j 8 goto:eof :unit_test_error -call paddle_winci\Scripts\deactivate.bat +call paddle_winci\Scripts\deactivate.bat 2>NUL echo Running unit tests failed, will exit! exit /b 8 @@ -228,7 +243,7 @@ cd %work_dir%\paddle\fluid\inference\api\demo_ci goto:eof :test_inference_error -call paddle_winci\Scripts\deactivate.bats +call paddle_winci\Scripts\deactivate.bat 2>NUL echo Testing fluid library for inference failed! exit /b 5 @@ -268,7 +283,10 @@ echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. 
-G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >> check_change_of_unittest.sh +echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ +-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% >> check_change_of_unittest.sh echo cat ^<^> check_change_of_unittest.sh echo ============================================ >> check_change_of_unittest.sh echo Generate unit tests.spec of develop. >> check_change_of_unittest.sh @@ -303,8 +321,8 @@ echo git checkout -f origin_pr >> check_change_of_unittest.sh goto:eof :check_change_of_unittest_error -call paddle_winci\Scripts\deactivate.bat -exit /b 6 +call paddle_winci\Scripts\deactivate.bat 2>NUL +exit /b 1 rem --------------------------------------------------------------------------------------------- @@ -322,7 +340,10 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL -call paddle_winci\Scripts\deactivate.bat +taskkill /f /im %cd%\paddle\fluid\pybind\Release\op_function_generator.exe 2>NUL +taskkill /f /im python.exe 2>NUL +call paddle_winci\Scripts\deactivate.bat 2>NUL +taskkill /f /im python.exe 2>NUL echo Windows CI run successfully! exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4bd93da72ef682cedf585871dc764da2a4da32c5..a77d605eb6c26b02c38a58195d1f8f1e84a3dc20 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -529,13 +529,16 @@ EOF pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi ut_startTime_s=`date +%s` - ctest --output-on-failure -j $2 + ctest --output-on-failure -j $2;mactest_error=$? 
ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" paddle version # Recovery proxy to avoid failure in later steps export http_proxy=$my_proxy export https_proxy=$my_proxy + if [ "$mactest_error" != 0 ];then + exit 8; + fi fi } @@ -570,6 +573,7 @@ function generate_upstream_develop_api_spec() { } function generate_api_spec() { + set -e spec_kind=$2 if [ "$spec_kind" != "PR" ] && [ "$spec_kind" != "DEV" ]; then echo "Not supported $2" @@ -580,7 +584,8 @@ function generate_api_spec() { cd ${PADDLE_ROOT}/build/.check_api_workspace virtualenv .${spec_kind}_env source .${spec_kind}_env/bin/activate - pip install ${PADDLE_ROOT}/build/python/dist/*whl + pip install -r ${PADDLE_ROOT}/python/requirements.txt + pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path @@ -875,6 +880,7 @@ set +x multiple_card_tests='' # cases list which would take multiple GPUs, most cases would be two GPUs is_exclusive='' # indicate whether the case is exclusive type is_multicard='' # indicate whether the case is multiple GPUs type + is_nightly='' # indicate whether the case will only run at night while read -r line; do if [[ "$line" == "" ]]; then continue @@ -884,12 +890,19 @@ set +x # Any test case with LABELS property would be parse here # RUN_TYPE=EXCLUSIVE mean the case would run exclusively # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime + # RUN_TYPE=NIGHTLY or RUN_TYPE=DIST:NIGHTLY or RUN_TYPE=EXCLUSIVE:NIGHTLY means the case will ONLY run at night read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE") read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST") + read is_nightly <<< $(echo "$line"|grep -oEi "RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY") continue fi read testcase <<< $(echo "$line"|grep -oEi "\w+$") + if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then + echo $testcase" will only run at night." + continue + fi + if [[ "$is_multicard" == "" ]]; then # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist") @@ -908,7 +921,7 @@ set +x multiple_card_tests="$multiple_card_tests|^$testcase$" fi else - if [[ "${#single_card_tests}" -gt 3000 ]];then + if [[ "${#single_card_tests}" -gt 10000 ]];then if [[ "$single_card_tests_1" == "" ]]; then single_card_tests_1="^$testcase$" else @@ -925,6 +938,7 @@ set +x fi is_exclusive='' is_multicard='' + is_nightly='' matchstr='' testcase='' done <<< "$test_cases"; @@ -934,17 +948,96 @@ set +x card_test "$multiple_card_tests" 2 # run cases with two GPUs card_test "$exclusive_tests" # run cases exclusively, in this cases would be run with 4/8 GPUs collect_failed_tests - if [ -n "${failed_test_lists}" ];then - failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` - echo "========================================" - echo "Summary Failed Tests... 
" - echo "========================================" - echo "The following tests FAILED: " - echo "${failed_test_lists_ult}" - fi rm -f $tmp_dir/* + exec_times=0 + retry_unittests_record='' + retry_time=3 + exec_time_array=('first' 'second' 'third') + if [ -n "$failed_test_lists" ];then + while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) + do + + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' ) + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${failed_test_lists_ult}" + + for line in ${retry_unittests[@]} ; + do + + one_card_tests=$single_card_tests'|'$single_card_tests_1 + + read tmp_one_tmp <<< "$( echo $one_card_tests | grep -oEi $line )" + read tmp_mul_tmp <<< "$( echo $multiple_card_tests | grep -oEi $line )" + read exclusive_tmp <<< "$( echo $exclusive_tests | grep -oEi $line )" + + if [[ "$tmp_one_tmp" != "" ]]; then + if [[ "$one_card_retry" == "" ]]; then + one_card_retry="^$line$" + else + one_card_retry="$one_card_retry|^$line$" + fi + elif [[ "$tmp_mul_tmp" != "" ]]; then + if [[ "$multiple_card_retry" == "" ]]; then + multiple_card_retry="^$line$" + else + multiple_card_retry="$multiple_card_retry|^$line$" + fi + else + if [[ "$exclusive_retry" == "" ]];then + exclusive_retry="^$line$" + else + exclusive_retry="$exclusive_retry|^$line$" + fi + fi + + done + + if [[ "$one_card_retry" != "" ]]; then + card_test "$one_card_retry" 1 + fi + + if [[ "$multiple_card_retry" != "" ]]; then + card_test "$multiple_card_retry" 2 + fi + + if [[ "$exclusive_retry" != "" ]]; then + card_test "$exclusive_retry" + fi + + exec_times=$[$exec_times+1] + failed_test_lists='' + collect_failed_tests + rm -f $tmp_dir/* + one_card_retry='' + multiple_card_retry='' + exclusive_retry='' + retry_unittests='' + done + fi + + + if [[ "$EXIT_CODE" != "0" ]]; then - exit 8; + if [[ "$failed_test_lists" == "" ]]; then + echo "========================================" + echo "There are failed tests, which have been successful after re-run:" + echo "========================================" + echo "The following tests have been re-ran:" + echo "${retry_unittests_record}" + else + failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` + echo "========================================" + echo "Summary Failed Tests... 
" + echo "========================================" + echo "The following tests FAILED: " + echo "${failed_test_lists_ult}" + exit 8; + fi fi set -ex fi @@ -1024,22 +1117,6 @@ EOF esac } -function gen_html() { - cat <= (3, 4): + raise RuntimeError( + "Use `paddle.distributed.spawn` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " + "`paddle.distributed.launch` instead.") + + +def _get_subprocess_env_list(nprocs, options): + # contruct processes env list + processes_env_list = [] + + # get args from kwargs + args = ParallelEnvArgs() + + # set default `node_ip` and `cluster_node_ips` + args.cluster_node_ips = options.get('cluster_node_ips', None) + args.node_ip = options.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot only give `cluster_node_ips`.") + default_node_ip = "127.0.0.1" + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + + # set default selected gpus + # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3" + # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ] + # because the FLAGS_selected_gpus may be used in other place, + # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error + # when using `ParallelEnv` + # NOTE(chenweihang): use absolute gpu card id + args.selected_gpus = options.get('selected_gpus', None) + env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) + if env_devices is None or env_devices == "": + env_devices_list = [ + str(x) for x in six.moves.range(core.get_cuda_device_count()) + ] + else: + env_devices_list = env_devices.split(',') + if args.selected_gpus is None: + if len(env_devices_list) < nprocs: + raise RuntimeError( + "the number of visible devices(%d) is less than the number " + "of spawn processes(%d), please ensure that the correct " + "`nprocs` argument is passed or the environment variable " + "`CUDA_VISIBLE_DEVICES` is correctly configured." % + (len(env_devices_list), nprocs)) + args.selected_gpus = ",".join( + [str(env_devices_list[x]) for x in range(0, nprocs)]) + else: + for card_id in args.selected_gpus.split(','): + if card_id not in env_devices_list: + raise ValueError("The selected gpu card %s cannot found in " + "CUDA_VISIBLE_DEVICES (%s)." 
% + (card_id, ",".join(env_devices_list))) + + # set other arguments + args.started_port = options.get('started_port', None) + args.use_paddlecloud = options.get('use_paddlecloud', False) + args.print_config = options.get('print_config', False) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # prepare subprocess env list + for trainer in pod.trainers: + processes_env_list.append(_prepare_trainer_env(cluster, trainer)) + + # print config + if args.print_config: + _print_arguments(args) + + return processes_env_list + + +def _remove_risky_env(): + # remove useless env vars, same as launch.py + # no copy, each process will hold env vars itself + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + +def _set_trainer_env(env_dict): + for var_name in env_dict: + os.environ[var_name] = env_dict[var_name] + + +def _func_wrapper(func, args, error_queue, return_queue, env_dict): + try: + # configure subprocess environment variables + _remove_risky_env() + _set_trainer_env(env_dict) + # execute function + result = func(*args) + # record function return value + return_queue.put(result) + except KeyboardInterrupt: + pass + except Exception: + import traceback + error_queue.put(traceback.format_exc()) + sys.exit(1) + + +class MultiprocessContext(object): + def __init__(self, processes, error_queues, return_queues): + _py_supported_check() + self.error_queues = error_queues + # NOTE(chenweihang): The `spawn` method is mainly used + # to wrap the outermost execution function of the program for + # parallel execution. Generally, the return value is not of concern, + # but if the user needs to obtain the return value, users can get + # the return result of each process from context.return_queues + self.return_queues = return_queues + self.processes = processes + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } + + def join(self, timeout=None): + if len(self.sentinels) == 0: + return True + + ready = multiprocessing.connection.wait( + self.sentinels.keys(), timeout=timeout) + + error_index = None + for sentinel in ready: + index = self.sentinels.pop(sentinel) + process = self.processes[index] + process.join() + if process.exitcode != 0: + error_index = index + break + + if error_index is None: + return len(self.sentinels) == 0 + + for process in self.processes: + if process.is_alive(): + process.terminate() + process.join() + + self._throw_exception(error_index) + + def _throw_exception(self, error_index): + if self.error_queues[error_index].empty(): + exitcode = self.processes[error_index].exitcode + if exitcode < 0: + name = signal.Signals(-exitcode).name + raise Exception("Process %d terminated with signal %s." % + (error_index, name)) + else: + raise Exception("Process %d terminated with exit code %d." % ( + error_index, exitcode)) + + original_trace = self.error_queues[error_index].get() + msg = "\n\n----------------------------------------------\n" \ + "Process %d terminated with the following error:\n" \ + "----------------------------------------------\n\n" % error_index + msg += original_trace + raise Exception(msg) + + +def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): + """ + Start multiple processes with ``spawn`` method for parallel training. + + Args: + func (function): The target function called by the spawned process. + This function needs to be picklable, so it must be defined + at the top level of a module.
+ This function should be called as ``func(i, *args)``, where ``i`` is + the process index and ``args`` contains other arguments as a tuple. + args (tuple, optional): Arguments passed to ``func``. + nprocs (int, optional): Number of processes to start. Default: -1. + When nprocs is -1, the available devices will be obtained from + the environment variable when the model is executed: if GPU is used, + the currently available device IDs are obtained from the environment + variable CUDA_VISIBLE_DEVICES; if CPU is used, the currently available + CPU number is obtained from the environment variable CPU_NUM. + For example, export CPU_NUM=4; if the environment variable is not set, + the executor will add the variable to the environment and + set its value to 1. + join (bool, optional): Perform a blocking join on all spawned processes. + Default: True. + daemon (bool, optional): The spawned processes' daemon flag. Default: False. + **options(dict, optional): Other initial parallel execution environment + configuration options. The following options are currently supported: + (1) start_method (string): the way to start a process. + The start method can be ``spawn`` , ``fork`` , ``forkserver`` . + Because the CUDA runtime does not support the ``fork`` start method, + when using CUDA in subprocesses, the process should be started by the ``spawn`` + or ``forkserver`` method. Default: "spawn" ; + (2) cluster_node_ips (string): Paddle cluster node ips, such as + "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; + (3) node_ip (string): The current node ip, such as "192.168.0.16". + Default: "127.0.0.1"; + (4) started_port (int): The trainer's started port on a single node, + such as 6170. Default: None; + (5) selected_gpus (string): The training process will run on the + selected_gpus, such as "0,1,2,3". Default: None; + (6) print_config: Print current parallel training config. Default: False; + (7) use_paddlecloud: Whether to use paddlecloud platform to run your + multi-process job. Default: False. + + Returns: + ``MultiprocessContext`` object, which holds the spawned processes. + + Examples: + .. code-block:: python + + from __future__ import print_function + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(print_result=False): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env() + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + if print_result is True: + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + # Usage 1: only pass function. + # If your training method does not need any arguments, and + # you use all visible devices for parallel training. + if __name__ == '__main__': + dist.spawn(train) + + # Usage 2: pass function and arguments.
+ # If your training method needs some arguments, and + # you use all visible devices for parallel training. + if __name__ == '__main__': + dist.spawn(train, args=(True,)) + + # Usage 3: pass function, arguments and nprocs. + # If your training method needs some arguments, and + # only uses part of the visible devices for parallel training. + # If your machine holds 8 cards {0,1,2,3,4,5,6,7}, + # this case will use cards {0,1}; if you set + # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use + # cards {4,5} + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2) + + # Usage 4: pass function, arguments, nprocs and selected_gpus. + # If your training method needs some arguments, and + # only uses part of the visible devices for parallel training, + # but you can't set your machine's environment variable + # CUDA_VISIBLE_DEVICES, e.g. it is None or all cards + # {0,1,2,3,4,5,6,7}, you can pass `selected_gpus` to + # select the GPU cards you want to use. For example, + # this case will use cards {4,5} if your machine holds 8 cards. + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2, selected_gpus='4,5') + """ + # NOTE(chenweihang): [ why only supports python3.4+ ? ] + # Python has supported setting the child process startup method + # since 3.4. Earlier versions can only use the default startup + # method, while the default startup method of Unix is fork, which + # cannot support CUDA runtime multi-process + _py_supported_check() + + # get default nprocs + if nprocs == -1: + device = get_device() + if device == 'cpu': + # TODO: cpu parallel is not supported yet + nprocs = _cpu_num + else: + nprocs = core.get_cuda_device_count() + + # NOTE(chenweihang): [ why need get cluster info before run? ] + # when using `paddle.distributed.spawn` to start parallel training, + # we should get cluster info before starting the subprocesses, and pass + # the correct info to each subprocess + procs_env_list = _get_subprocess_env_list(nprocs, options) + + # start processes + # NOTE(chenweihang): [ why default start method is spawn? ] + # The CUDA runtime does not support the fork start method; + # either the spawn or forkserver start method is required + # to use CUDA in subprocesses.
+ start_method = options.get('start_method', None) + if start_method is None: + start_method = 'spawn' + mp = multiprocessing.get_context(start_method) + + error_queues = [] + return_queues = [] + processes = [] + for i in range(nprocs): + error_queue = mp.SimpleQueue() + return_queue = mp.SimpleQueue() + process = mp.Process( + target=_func_wrapper, + args=(func, args, error_queue, return_queue, procs_env_list[i])) + process.daemon = daemon + process.start() + error_queues.append(error_queue) + return_queues.append(return_queue) + processes.append(process) + + context = MultiprocessContext(processes, error_queues, return_queues) + if not join: + return context + + # loop until all process end + while not context.join(): + pass + + # finally return context + return context diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 7c8fa257f778e71cab35054c3f9d63faaa33de47..1fa307c4d1b89d4033a8f8346b254177053e9dc0 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -327,6 +327,17 @@ def find_free_ports(num): return None +def _prepare_trainer_env(cluster, trainer): + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + return proc_env + + class TrainerProc(object): def __init__(self): self.proc = None @@ -352,14 +363,7 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), - "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - + proc_env = _prepare_trainer_env(cluster, t) current_env.update(proc_env) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index fff10c5b2a9ee497cccff94346314db2c8011eb5..49e98805d24f3f8f5dc1cfcbf3ddc8d9fb835fde 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -18,3 +18,540 @@ # 'Normal', # 'sampling_id', # 'Uniform'] + +from __future__ import print_function + +from .fluid.layers import control_flow +from .fluid.layers import tensor +from .fluid.layers import ops +from .fluid.layers import nn +from .fluid import core +from .fluid.framework import in_dygraph_mode +from .tensor.math import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub +import math +import numpy as np +import warnings + +from .fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype + +__all__ = ['Distribution', 'Uniform', 'Normal'] + + +class Distribution(object): + """ + The abstract base class for probability distributions. Functions are + implemented in specific distributions. 
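For orientation, the high-level API these distribution classes provide can be exercised roughly as below. This is a minimal sketch: `low`/`high` follow the `Uniform` docstring, while `loc`/`scale` for `Normal` are assumed from the released API since its definition is not shown in this hunk.

```python
# Hedged sketch of the paddle.distribution API (argument names for Normal are assumptions).
import paddle
from paddle.distribution import Uniform, Normal

paddle.disable_static()

u = Uniform(low=0.0, high=1.0)
samples = u.sample([4])                 # draw samples from U(0, 1)
log_p = u.log_prob(samples)             # log density at the sampled points

n = Normal(loc=0.0, scale=1.0)
kl = n.kl_divergence(Normal(loc=1.0, scale=2.0))   # KL between two normals
```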
+ """ + + def __init__(self): + super(Distribution, self).__init__() + + def sample(self): + """Sampling from the distribution.""" + raise NotImplementedError + + def entropy(self): + """The entropy of the distribution.""" + raise NotImplementedError + + def kl_divergence(self, other): + """The KL-divergence between self distributions and other.""" + raise NotImplementedError + + def log_prob(self, value): + """Log probability density/mass function.""" + raise NotImplementedError + + def probs(self, value): + """Probability density/mass function.""" + raise NotImplementedError + + def _validate_args(self, *args): + """ + Argument validation for distribution args + Args: + value (float, list, numpy.ndarray, Tensor) + Raises + ValueError: if one argument is Tensor, all arguments should be Tensor + """ + is_variable = False + is_number = False + for arg in args: + if isinstance(arg, tensor.Variable): + is_variable = True + else: + is_number = True + + if is_variable and is_number: + raise ValueError( + 'if one argument is Tensor, all arguments should be Tensor') + + return is_variable + + def _to_tensor(self, *args): + """ + Argument convert args to Tensor + + Args: + value (float, list, numpy.ndarray, Tensor) + Returns: + Tensor of args. + """ + numpy_args = [] + variable_args = [] + tmp = 0. + + for arg in args: + valid_arg = False + for cls in [float, list, np.ndarray, tensor.Variable]: + if isinstance(arg, cls): + valid_arg = True + break + assert valid_arg, "type of input args must be float, list, numpy.ndarray or Tensor." + if isinstance(arg, float): + arg = np.zeros(1) + arg + arg_np = np.array(arg) + arg_dtype = arg_np.dtype + if str(arg_dtype) not in ['float32']: + warnings.warn( + "data type of argument only support float32, your argument will be convert to float32." + ) + arg_np = arg_np.astype('float32') + tmp = tmp + arg_np + numpy_args.append(arg_np) + + dtype = tmp.dtype + for arg in numpy_args: + arg_broadcasted, _ = np.broadcast_arrays(arg, tmp) + arg_variable = tensor.create_tensor(dtype=dtype) + tensor.assign(arg_broadcasted, arg_variable) + variable_args.append(arg_variable) + + return tuple(variable_args) + + +class Uniform(Distribution): + """Uniform distribution with `low` and `high` parameters. + + Mathematical Details + + The probability density function (pdf) is + + .. math:: + + pdf(x; a, b) = \\frac{1}{Z}, \ a <=x 0, \ "batch_size should be a positive integer, but got {}".format(batch_size) self.batch_size = batch_size - assert isinstance(shuffle, bool), \ - "shuffle should be a boolean value, but got {}".format(type(shuffle)) - self.shuffle = shuffle assert isinstance(drop_last, bool), \ "drop_last should be a boolean value, but got {}".format(type(drop_last)) self.drop_last = drop_last diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index a81d73d7e9a621d2a02ed91541f32b827bdff38c..6a996493e4df1e1facc6ccd205a8ae5105f92c5b 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -30,7 +30,8 @@ if six.PY2: else: import queue -from .. import core +import paddle +from .. 
import core, layers from ..framework import in_dygraph_mode from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCHLD_handler from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher @@ -79,7 +80,13 @@ def default_collate_fn(batch): slots.append([item]) else: slots[i].append(item) - return [np.stack(slot, axis=0) for slot in slots] + + if isinstance(slots[0][0], np.ndarray): + return [np.stack(slot, axis=0) for slot in slots] + elif isinstance(slots[0][0], paddle.Tensor): + return [layers.stack(slot, axis=0) for slot in slots] + else: + raise RuntimeError("Unknown data type {}".format(type(slots[0][0]))) class _DatasetKind(object): @@ -284,6 +291,12 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): for slot in batch: if not isinstance(slot, core.LoDTensor): self._check_input_array(slot) + # FIXME(dkp): blocking_queue only support + # core.LoDTensorArray as input now, read + # numpy data into a LoDTensorArray here, + # should support paddle.Tensor list later + if isinstance(slot, paddle.Tensor): + slot = slot.numpy() tmp = core.LoDTensor() tmp.set(slot, core.CPUPlace()) slot = tmp @@ -305,6 +318,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): @classmethod def _check_input_array(cls, item): + if isinstance(item, paddle.Tensor): + return arr = np.array(item) if arr.dtype == np.object: raise TypeError(( @@ -530,6 +545,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): out_queue.put((idx, e)) else: if self._use_shared_memory: + # FIXME(dkp): _convert_to_tensor_list only support np.array + # list now, should support paddle.Tensor list + if isinstance(batch[0][0], paddle.Tensor): + np_batch = [] + for sample in batch: + np_batch.append([s.numpy() for s in sample]) + batch = np_batch + tensor_list = core._convert_to_tensor_list(batch) out_queue.put((idx, tensor_list)) core._remove_tensor_list_mmap_fds(tensor_list) @@ -585,22 +608,24 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): # in _send_idx but will not increase _rcvd_idx, so we check # whether the worker is still alive here to skip the discarded # batch indices and increase _rcvd_idx - while self._rcvd_idx < self._send_idx: - info = self._task_infos[self._rcvd_idx] - if len(info) == 2 or self._worker_status[info[0]]: - break - del self._task_infos[self._rcvd_idx] - self._rcvd_idx += 1 - self._batches_outstanding -= 1 - else: - # NOTE: _rcvd_idx and _send_idx only record batches among - # workers, if batches among workers drained, there - # may also be data in blocking queue - if self._batches_outstanding < len(self._places): - return None - continue - - if len(self._task_infos[self._rcvd_idx]) == 2: + if self._dataset_kind == _DatasetKind.ITER: + while self._rcvd_idx < self._send_idx: + info = self._task_infos[self._rcvd_idx] + if len(info) == 2 or self._worker_status[info[0]]: + break + del self._task_infos[self._rcvd_idx] + self._rcvd_idx += 1 + self._batches_outstanding -= 1 + else: + # NOTE: _rcvd_idx and _send_idx only record batches among + # workers, if batches among workers drained, there + # may also be data in blocking queue + if self._batches_outstanding < len(self._places): + return None + continue + + if self._rcvd_idx in self._task_infos and \ + len(self._task_infos[self._rcvd_idx]) == 2: return self._task_infos.pop(self._rcvd_idx)[1] try: diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index e47f57381c0decaf365df58e96d6386a01f9ef1e..13bb946a5ebca09686fc7f56b2f7c5b068ea3148 100644 --- 
a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -14,9 +14,10 @@ from __future__ import print_function +from .. import framework import paddle.dataset.common -__all__ = ["Dataset", "IterableDataset"] +__all__ = ["Dataset", "IterableDataset", "TensorDataset"] class Dataset(object): @@ -222,3 +223,55 @@ class IterableDataset(Dataset): def __len__(self): raise RuntimeError("'{}' should not be called for IterableDataset" \ "{}".format('__len__', self.__class__.__name__)) + + +class TensorDataset(Dataset): + """ + Dataset defined by a list of tensors. + + Each tensor should be in the shape of [N, ...], where N is the sample number, + and each tensor represents one field of a sample; :code:`TensorDataset` retrieves + each sample by indexing the tensors along the 1st dimension. + + Args: + tensors(list of Tensor): tensors with the same shape in the 1st dimension. + + Returns: + Dataset: a Dataset instance wrapping tensors. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + from paddle.io import TensorDataset + + paddle.disable_static() + + input_np = np.random.random([2, 3, 4]).astype('float32') + input = paddle.to_tensor(input_np) + label_np = np.random.random([2, 1]).astype('int32') + label = paddle.to_tensor(label_np) + + dataset = TensorDataset([input, label]) + + for i in range(len(dataset)): + input, label = dataset[i] + print(input, label) + + """ + + def __init__(self, tensors): + if not framework.in_dygraph_mode(): + raise RuntimeError( + "TensorDataset can only be used in imperative mode") + assert all([tensor.shape[0] == tensors[0].shape[0] for tensor in tensors]), \ + "tensors do not have the same shape in the 1st dimension" + self.tensors = tensors + + def __getitem__(self, index): + return tuple(tensor[index] for tensor in self.tensors) + + def __len__(self): + return self.tensors[0].shape[0] diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py index d2f3231cc6b12f1190ad9453ea48213edc9122b8..5c75fafe8b22380090ba6fb580777cdbe6570ad6 100644 --- a/python/paddle/fluid/dataloader/sampler.py +++ b/python/paddle/fluid/dataloader/sampler.py @@ -177,7 +177,7 @@ class RandomSampler(Sampler): def __len__(self): return self.num_samples - sampler = RandomSampler(data_souce=RandomDataset(100)) + sampler = RandomSampler(data_source=RandomDataset(100)) for index in sampler: print(index) @@ -216,7 +216,11 @@ class RandomSampler(Sampler): def __iter__(self): n = len(self.data_source) if self.generator: - for index in self.generator: + for i in range(self.num_samples): + try: + index = next(self.generator) + except StopIteration: + return yield index else: if self.replacement: diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index 20f48db0808b04d09ddd4537cbec2af939ad7692..cf270ced3b704179856b1ab04dbeae8a04fbc589 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -38,9 +38,6 @@ from .checkpoint import * from . import learning_rate_scheduler from .learning_rate_scheduler import * -from . import backward_strategy -from .backward_strategy import * - from . import jit from .jit import * @@ -59,6 +56,8 @@ from .rnn import * from . 
import amp from .amp import * +from .math_op_patch import monkey_patch_math_varbase + __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ @@ -67,7 +66,6 @@ __all__ += nn.__all__ __all__ += parallel.__all__ __all__ += checkpoint.__all__ __all__ += learning_rate_scheduler.__all__ -__all__ += backward_strategy.__all__ __all__ += jit.__all__ __all__ += io.__all__ __all__ += rnn.__all__ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index a6c9f90ea6c94cb4fcfb664f03918b7cc7ebe8e0..2174dbd31b8fb1ae97894699e03e25e809085cc8 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -91,8 +91,8 @@ def enabled(): """ This function checks whether the program runs in dynamic graph mode or not. You can enter dynamic graph mode with :ref:`api_fluid_dygraph_guard` api, - or enable and disable dynamic graph mode with :ref:`api_fluid_dygraph_enable` - and :ref:`api_fluid_dygraph_disable` api . + or enable and disable dynamic graph mode with :ref:`api_fluid_dygraph_enable_dygraph` + and :ref:`api_fluid_dygraph_disable_dygraph` api . **Note**: ``fluid.dygraph.enabled`` is the alias of ``fluid.in_dygraph_mode``, and @@ -182,19 +182,19 @@ class no_grad: .. code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle - paddle.enable_imperative() + paddle.disable_static() # use as generator data = np.array([[2, 3], [4, 5]]).astype('float32') - l0 = fluid.Linear(2, 2) # l0.weight.gradient() is None - l1 = fluid.Linear(2, 2) - with fluid.no_grad(): + l0 = paddle.nn.Linear(2, 2) # l0.weight.gradient() is None + l1 = paddle.nn.Linear(2, 2) + with paddle.no_grad(): # l1.weight.stop_gradient is False tmp = l1.weight * 2 # tmp.stop_gradient is True - x = fluid.dygraph.to_variable(data) + x = paddle.to_tensor(data) y = l0(x) + tmp o = l1(y) o.backward() @@ -203,12 +203,12 @@ class no_grad: # use as decorator - @fluid.no_grad() + @paddle.no_grad() def test_layer(): inp = np.ones([3, 1024], dtype='float32') - t = fluid.dygraph.base.to_variable(inp) - linear1 = fluid.Linear(1024, 4, bias_attr=False) - linear2 = fluid.Linear(4, 4) + t = paddle.to_tensor(inp) + linear1 = paddle.nn.Linear(1024, 4, bias_attr=False) + linear2 = paddle.nn.Linear(4, 4) ret = linear1(t) dy_ret = linear2(ret) @@ -319,8 +319,7 @@ def grad(outputs, create_graph=False, only_inputs=True, allow_unused=False, - no_grad_vars=None, - backward_strategy=None): + no_grad_vars=None): ''' .. note:: **This API is ONLY available in Dygraph mode.** @@ -328,19 +327,19 @@ def grad(outputs, This API computes the sum of gradients of `outputs` with respect to each `inputs` . Parameters: - outputs (Variable|list(Variable)|tuple(Variable)): the output Variable or - Variable list/tuple of the graph to compute gradients. - inputs (Variable|list(Variable)|tuple(Variable)): the input Variable or - Variable list/tuple of the graph to compute gradients. The returned + outputs (Tensor|list(Tensor)|tuple(Tensor)): the output Tensor or + Tensor list/tuple of the graph to compute gradients. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the graph to compute gradients. The returned values of this API are the gradients of `inputs` . - grad_outputs (Variable|list(Variable|None)|tuple(Variable|None), optional): + grad_outputs (Tensor|list(Tensor|None)|tuple(Tensor|None), optional): initial gradient values of `outputs` . 
If `grad_outputs` is None, the initial gradient values of `outputs` would be Tensors filled with 1; if `grad_outputs` is not None, it must have the same length as `outputs` , and in this case, the initial gradient value of the i-th `outputs` would be: (1) a Tensor filled with 1 when the i-th element of `grad_outputs` is None; (2) the i-th element of `grad_outputs` when the i-th element of - `grad_outputs` is a Variable. Default None. + `grad_outputs` is a Tensor. Default None. retain_graph (bool, optional): whether to retain the forward graph which is used to calculate the gradient. When it is True, the graph would be retained, in which way users can calculate backward twice for the @@ -352,70 +351,66 @@ def grad(outputs, computing process would be discarded. Default False. only_inputs (bool, optional): whether to only compute the gradients of `inputs` . If it is False, the gradients of all remaining leaf - Variables in the graph would be also computed and accumulated. + Tensors in the graph would be also computed and accumulated. If it is True, only the gradients of `inputs` would be computed. Default True. only_inputs=False is under development, and it is not supported yet. allow_unused (bool, optional): whether to raise error or return None if some - Variables of `inputs` are unreachable in the graph. If some Variables of + Tensors of `inputs` are unreachable in the graph. If some Tensors of `inputs` are unreachable in the graph (i.e., their gradients are None), error would be raised if allow_unused=False, or None would be returned as their gradients if allow_unused=True. Default False. - no_grad_vars (Variable|list(Variable)|tuple(Variable)|set(Variable), optional): - the Variables whose gradients are not needed to compute. Default None. - backward_strategy (BackwardStrategy, optional): The backward strategy to - compute gradients. See :ref:`api_fluid_dygraph_BackwardStrategy` for - details. Default None. + no_grad_vars (Tensor|list(Tensor)|tuple(Tensor)|set(Tensor), optional): + the Tensors whose gradients are not needed to compute. Default None. Returns: - tuple: a tuple of Variables, whose length is the same as the Variable number - inside `inputs`, and the i-th returned Variable is the sum of gradients of + tuple: a tuple of Tensors, whose length is the same as the Tensor number + inside `inputs`, and the i-th returned Tensor is the sum of gradients of `outputs` with respect to the i-th `inputs`. Examples 1: .. code-block:: python - import paddle.fluid as fluid + import paddle + paddle.disable_static() def test_dygraph_grad(create_graph): - with fluid.dygraph.guard(): - x = fluid.layers.ones(shape=[1], dtype='float32') - x.stop_gradient = False - y = x * x - - # Since y = x * x, dx = 2 * x - dx = fluid.dygraph.grad( - outputs=[y], - inputs=[x], - create_graph=create_graph, - retain_graph=True)[0] - - z = y + dx - - # If create_graph = False, the gradient of dx - # would not be backpropagated. Therefore, - # z = x * x + dx, and x.gradient() = 2 * x = 2.0 - - # If create_graph = True, the gradient of dx - # would be backpropagated. Therefore, - # z = x * x + dx = x * x + 2 * x, and - # x.gradient() = 2 * x + 2 = 4.0 - - z.backward() - return x.gradient() - - print(test_dygraph_grad(create_graph=False)) # [2.] 
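The `allow_unused` and `no_grad_vars` parameters described above are not exercised by the docstring examples in this hunk. The following is a minimal sketch (editorial, not part of the patch) of the `allow_unused` behaviour, using the same `paddle.grad` alias as the examples below:

.. code-block:: python

    import paddle
    paddle.disable_static()

    x = paddle.ones(shape=[1], dtype='float32')
    y = paddle.ones(shape=[1], dtype='float32')
    x.stop_gradient = False
    y.stop_gradient = False

    z = x * x  # y does not participate in computing z

    # y is unreachable from z: with allow_unused=True its gradient is
    # returned as None instead of raising an error.
    dx, dy = paddle.grad(outputs=[z], inputs=[x, y], allow_unused=True)
    print(dx.numpy())  # [2.]
    print(dy)          # None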
+ x = paddle.ones(shape=[1], dtype='float32') + x.stop_gradient = False + y = x * x + + # Since y = x * x, dx = 2 * x + dx = paddle.grad( + outputs=[y], + inputs=[x], + create_graph=create_graph, + retain_graph=True)[0] + + z = y + dx + + # If create_graph = False, the gradient of dx + # would not be backpropagated. Therefore, + # z = x * x + dx, and x.gradient() = 2 * x = 2.0 + + # If create_graph = True, the gradient of dx + # would be backpropagated. Therefore, + # z = x * x + dx = x * x + 2 * x, and + # x.gradient() = 2 * x + 2 = 4.0 + + z.backward() + return x.gradient() + + print(test_dygraph_grad(create_graph=False)) # [2.] print(test_dygraph_grad(create_graph=True)) # [4.] Examples 2: .. code-block:: python - import paddle.fluid as fluid - - fluid.enable_dygraph() + import paddle + paddle.disable_static() def test_dygraph_grad(grad_outputs=None): - x = fluid.layers.fill_constant(shape=[1], value=2.0, dtype='float32') + x = paddle.fill_constant(shape=[1], value=2.0, dtype='float32') x.stop_gradient = False y1 = x * x @@ -431,27 +426,27 @@ def grad(outputs, # Therefore, the final result would be: # dx = 2 * x * dy1 + 3 * dy2 = 4 * dy1 + 3 * dy2. - dx = fluid.dygraph.grad( + dx = paddle.grad( outputs=[y1, y2], inputs=[x], grad_outputs=grad_outputs)[0] return dx.numpy() - THREE = fluid.layers.fill_constant(shape=[1], value=3.0, dtype='float32') - FOUR = fluid.layers.fill_constant(shape=[1], value=4.0, dtype='float32') + grad_value = paddle.fill_constant(shape=[1], value=4.0, dtype='float32') # dy1 = [1], dy2 = [1] print(test_dygraph_grad(None)) # [7.] # dy1 = [1], dy2 = [4] - print(test_dygraph_grad([None, FOUR])) # [16.] + print(test_dygraph_grad([None, grad_value])) # [16.] # dy1 = [4], dy2 = [1] - print(test_dygraph_grad([FOUR, None])) # [19.] + print(test_dygraph_grad([grad_value, None])) # [19.] # dy1 = [3], dy2 = [4] - print(test_dygraph_grad([THREE, FOUR])) # [24.] + grad_y1 = paddle.fill_constant(shape=[1], value=3.0, dtype='float32') + print(test_dygraph_grad([grad_y1, grad_value])) # [24.] ''' def check_in_out(in_out_list, name): @@ -504,12 +499,6 @@ def grad(outputs, raise AssertionError( "no_grad_vars must be None, Variable or list/tuple/set of Variables") - if backward_strategy is None: - backward_strategy = core.BackwardStrategy() - - assert isinstance(backward_strategy, core.BackwardStrategy), \ - "backward_strategy must be type paddle.fluid.dygraph.BackwardStrategy" - assert isinstance(create_graph, bool), "create_graph must be True or False" if retain_graph is None: @@ -525,9 +514,9 @@ def grad(outputs, place = core.Place() place.set_place(framework._current_expected_place()) - return core.dygraph_partial_grad( - inputs, outputs, grad_outputs, no_grad_vars, place, backward_strategy, - create_graph, retain_graph, allow_unused, only_inputs) + return core.dygraph_partial_grad(inputs, outputs, grad_outputs, + no_grad_vars, place, create_graph, + retain_graph, allow_unused, only_inputs) @framework.dygraph_only diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py b/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py index 1f91027e462d3437b0ef01455aa037cb38d8b658..9608910ee8d6223ea8e7bab06d5db90632cc2be0 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py @@ -34,6 +34,9 @@ from .convert_call_func import * from . import convert_operators +from . 
import logging_utils +from .logging_utils import * + __all__ = [] __all__ += ast_transformer.__all__ __all__ += loop_transformer.__all__ @@ -41,3 +44,4 @@ __all__ += static_analysis.__all__ __all__ += variable_trans_func.__all__ __all__ += program_translator.__all__ __all__ += convert_call_func.__all__ +__all__ += logging_utils.__all__ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index f859d40050c73d276bf9940d904c656debd35c82..5152799ca72f1461d6fbfc3a619a6aa9b9477934 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -19,7 +19,6 @@ from __future__ import print_function # as produced by ast.parse from the standard ast module. # See details in https://github.com/serge-sans-paille/gast/ import gast - from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer @@ -31,14 +30,16 @@ from paddle.fluid.dygraph.dygraph_to_static.logical_transformer import LogicalTr from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import LoopTransformer from paddle.fluid.dygraph.dygraph_to_static.print_transformer import PrintTransformer from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ReturnTransformer +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.tensor_shape_transformer import TensorShapeTransformer -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor +from paddle.fluid.dygraph.dygraph_to_static import logging_utils +from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name __all__ = ['DygraphToStaticAst'] -DECORATOR_NAMES = ['declarative', 'dygraph_to_static_func'] +DECORATOR_NAMES = ['declarative', 'to_static', 'dygraph_to_static_func'] class DygraphToStaticAst(gast.NodeTransformer): @@ -57,45 +58,70 @@ class DygraphToStaticAst(gast.NodeTransformer): return self.static_analysis_root def transfer_from_node_type(self, node_wrapper): + translator_logger = logging_utils.TranslatorLogger() + translator_logger.log( + 1, " Source code: \n{}".format(ast_to_source_code(self.root))) # Generic transformation self.visit(node_wrapper.node) # Transform basic api of dygraph to static graph and get feed_name_to_arg_name - basic_api_trans = BasicApiTransformer(node_wrapper) - basic_api_trans.transform() + BasicApiTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(1, self.root, + "BasicApiTransformer") # Transform Tensor.shape into fluid.layers.shape(Tensor) TensorShapeTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(2, self.root, + "TensorShapeTransformer") # Transform list used in control flow ListTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(3, self.root, "ListTransformer") # Transform break/continue in loops BreakContinueTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(4, self.root, + "BreakContinueTransformer") # Transform return in functions ReturnTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(5, 
self.root, + "ReturnTransformer") + # Transform logical and/or/not LogicalTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(6, self.root, + "LogicalTransformer") + # Transform for loop and while loop LoopTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(7, self.root, "LoopTransformer") + # Transform all if/else statement of Dygraph into Static Graph. IfElseTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(8, self.root, + "IfElseTransformer") + # Transform python assert statement AssertTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(9, self.root, + "AssertTransformer") + # Transform all python print statement PrintTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(10, self.root, + "PrintTransformer") + # Transform call recursively CallTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(11, self.root, "CallTransformer") + # Transform python type casting statement CastTransformer(node_wrapper).transform() + translator_logger.log_transformed_code(12, self.root, "CastTransformer") + + translator_logger.log_transformed_code(logging_utils.LOG_AllTransformer, + self.root, "All Transformers") def visit_FunctionDef(self, node): if self.decorate_func_name is None: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index 4ba1d302576df695c5b2e867452b91b3d1d2844a..7fc72d42759b0f8029ac6adfc7b9670fbffc67d5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -19,6 +19,8 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrappe from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import is_paddle_api +PDB_SET = "pdb.set_trace" + class CallTransformer(gast.NodeTransformer): """ @@ -62,6 +64,12 @@ class CallTransformer(gast.NodeTransformer): return node func_str = ast_to_source_code(node.func).strip() + + # NOTE(liym27): Don't convert `pdb.set_trace` even if the conversion doesn't work in the end, because + # it is clearer to see where it is called from.
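The numbered `log_transformed_code` calls above give each AST pass a level (1 for `BasicApiTransformer` up to 12 for `CastTransformer`, plus `LOG_AllTransformer` for the combined result). As a rough sketch, assuming the `paddle.jit.set_code_level` wrapper added later in this patch is exported as its docstring describes, a user can dump the code produced by a particular pass like this (the function `foo` is illustrative only):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()

    # Print the transformed code right after level 3, i.e. ListTransformer
    # in the ordering above. Setting TRANSLATOR_CODE_LEVEL=3 in the
    # environment would work as well, with lower priority.
    paddle.jit.set_code_level(3)

    @paddle.jit.to_static
    def foo(x):
        y = []
        for i in range(3):
            y.append(x + i)
        return y

    # the transformation and the logging happen on the first call
    foo(paddle.to_tensor(np.ones([1]).astype('float32')))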
+ if PDB_SET in func_str: + return node + new_func_str = "fluid.dygraph.dygraph_to_static.convert_call({})".format( func_str) new_func_ast = gast.parse(new_func_str).body[0].value diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index edd7dfcf93977bb3244c0c1676715a65dba88dc2..4630cfcdabfd307ea03a7fd0c885c73ce4a4ea0b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -27,13 +27,16 @@ import types import numpy import six -from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator -from paddle.fluid.dygraph.layers import Layer from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len +from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger +from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticLayer +from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static +from paddle.fluid.dygraph.layers import Layer -DECORATOR_NAMES = ['declarative', 'dygraph_to_static_func'] -program_translator = ProgramTranslator() -to_static_func = program_translator.get_func +# TODO(liym27): A better way to do this. +BUILTIN_LIKELY_MODULES = [collections, pdb, copy, inspect, re, six, numpy] + +translator_logger = TranslatorLogger() def is_builtin(func): @@ -41,11 +44,6 @@ def is_builtin(func): return True elif func in six.moves.builtins.__dict__.values(): return True - # Other built-in modules - # TODO(liym27): A better way to do this. - elif any(func in m.__dict__.values() - for m in (collections, pdb, copy, inspect, re, six, numpy)): - return True else: return False @@ -61,9 +59,29 @@ def is_paddle_func(func): return m is not None and m.__name__.startswith("paddle") + +def is_unsupported(func): + """ + Checks whether the func is a built-in or Paddle function that dygraph to static graph does not need to transform. + """ + + if any(func in m.__dict__.values() for m in BUILTIN_LIKELY_MODULES): + translator_logger.log( + 2, + "Whitelist: {} is part of built-in module and does not have to be transformed.". + format(func)) + return True + + if is_paddle_func(func): + translator_logger.log( + 2, + "Whitelist: {} is part of Paddle module and does not have to be transformed.". + format(func)) + return True + + def convert_call(func): """ - Converts a function call which needs to be transformed to static fucntion. + Converts a function call which needs to be transformed to static function. Args: func (callable): A callable function or method to convert. @@ -95,13 +113,24 @@ def convert_call(func): # [1. 1. 1.]] """ + translator_logger.log(1, + "Convert callable object: convert {}.".format(func)) func_self = None converted_call = None + # Function in convert_call may be decorated by another `@declarative`, + # in this case, unwraps it into a raw method or function.
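To make the effect of the `PDB_SET` check and the `convert_call` wrapping concrete, here is a schematic before/after of a single call site, sketched from the `new_func_str` template above (`my_func` and `x` are illustrative names, not from the patch; the snippet assumes they are defined in the surrounding function):

.. code-block:: python

    # original statement inside a function being transformed
    out = my_func(x)

    # what CallTransformer rewrites it into (see new_func_str above)
    out = fluid.dygraph.dygraph_to_static.convert_call(my_func)(x)

    # At runtime, convert_call(my_func) returns:
    #   * my_func unchanged if it is a builtin, lives in one of the
    #     BUILTIN_LIKELY_MODULES, or is a Paddle API (is_builtin / is_unsupported);
    #   * a statically transformed version of my_func otherwise.
    # Call sites whose source contains "pdb.set_trace" are left untouched.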
+ if isinstance(func, StaticLayer): + instance = func._class_instance + if instance is not None: + func = func.dygraph_function.__get__(instance) + else: + func = func.dygraph_function + if is_builtin_len(func): return convert_len - if is_builtin(func) or is_paddle_func(func): + if is_builtin(func) or is_unsupported(func): return func if inspect.isfunction(func): @@ -109,12 +138,36 @@ def convert_call(func): if func.__name__ == '': return func try: - global_funcs = set([ - fn for fn in func.__globals__.values() if inspect.isfunction(fn) - ]) - if func in global_funcs: - converted_call = to_static_func(func) + # Note(Aurelius84): Because `@declarative` returns a class instance instead of + # a function. This will modify the value referring to itself in `__globals__`. + + # For example: + # + # @declarative + # def foo(x): + # return x + # + # `foo` will be converted into a wrapper class, suppose as `StaticLayer`. + # And `foo.__globals__['foo']` will still return this `StaticLayer` instead of + # `foo` function. So `isinstance(fn, StaticLayer)` is added here. + global_functions = set() + for fn in func.__globals__.values(): + if inspect.isfunction(fn): + global_functions.add(fn) + elif isinstance(fn, StaticLayer): + global_functions.add(fn.dygraph_function) + + if func in global_functions: + converted_call = convert_to_static(func) func_self = getattr(func, '__self__', None) + else: + # NOTE: + # If func is not in __globals__, it does not need to be transformed + # because it has been transformed before. + translator_logger.warn( + "{} doesn't have to be transformed to static function because it has been transformed before, it will be run as-is." + .format(func)) + converted_call = func except AttributeError: # NOTE: # If func is not in __globals__, it does not need to be transformed @@ -127,7 +180,7 @@ def convert_call(func): converted_call = None elif inspect.ismethod(func): try: - converted_call = to_static_func(func) + converted_call = convert_to_static(func) func_self = getattr(func, '__self__', None) except (IOError, OSError): # NOTE: func may have been decorated. @@ -136,7 +189,7 @@ def convert_call(func): elif hasattr(func, '__class__') and hasattr(func.__class__, '__call__'): if hasattr(func, 'forward') and isinstance(func, Layer): try: - forward_func = to_static_func(func.forward) + forward_func = convert_to_static(func.forward) setattr(func, 'forward', forward_func) func_self = func except Exception: @@ -146,15 +199,21 @@ def convert_call(func): else: try: call_func = func.__class__.__call__ - converted_call = to_static_func(call_func) + converted_call = convert_to_static(call_func) func_self = func except Exception: # NOTE: # If `func` is a class which is being initialized, for example `convert_call(Foo)()`, # it doesn't need to be transformed func_self = None if func_self else func_self + else: + raise NotImplementedError( + "Callable {} can not be transformed at present.".format(func)) if converted_call is None: + translator_logger.warn( + "{} doesn't have to be transformed to static function, and it will be run as-is." + .format(func)) return func if func_self: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..5540c63a85bd7f8db760f0c3e25be9eefa2aace7 --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -0,0 +1,311 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import six +import inspect +import numpy as np +import collections +import paddle +from paddle.fluid import core +from paddle.fluid.dygraph import layers +from paddle.fluid.layers.utils import flatten +from paddle.fluid.layers.utils import pack_sequence_as +from paddle.fluid.dygraph.base import switch_to_static_graph +from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs +from paddle.fluid.dygraph.dygraph_to_static.utils import type_name +from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code + + +class FunctionSpec(object): + """ + Wrapper class for a function for class method. + """ + + def __init__(self, function, input_spec=None): + self._dygraph_function = function + if input_spec is None: + self._input_spec = None + self._flat_input_spec = None + else: + self._input_spec = self._verify_input_spec(input_spec) + self._flat_input_spec = flatten(self._input_spec) + + # parse full argument names list. + self._arg_names, self._default_kwargs = parse_arg_and_kwargs(function) + + def unified_args_and_kwargs(self, args, kwargs): + """ + Moves kwargs with default value into arguments list to keep `args` contain the same length + value as function definition. + + For example: + + Given function definition: `def foo(x, a=1, b=2)`, + when calling it by `foo(23)`, the args is `[23]`, kwargs is `{a=1, b=2}`. + In this function, it will return args with `[23, 1, 2]`, kwargs with `{}` + + Args: + args(tuple): tuple of input arguments value of decorated function. + kwargs(dict): dict of input keyword arguments value of decorated function. + + Return: + New arguments tuple containing default kwargs value. + """ + if len(self._arg_names) < len(args): + error_msg = "The decorated function `{}` requires {} arguments: {}, but received {} with {}.".format( + self._dygraph_function.__name__, + len(self._arg_names), self._arg_names, len(args), args) + if args and inspect.isclass(args[0]): + error_msg += "\n\tMaybe the function has more than one decorator, we don't support this for now." + raise NotImplementedError(error_msg) + else: + raise ValueError(error_msg) + + args = list(args) + + for i in six.moves.range(len(args), len(self._arg_names)): + arg_name = self._arg_names[i] + if arg_name in kwargs: + args.append(kwargs[arg_name]) + del kwargs[arg_name] + else: + if arg_name not in self._default_kwargs: + raise ValueError( + "`{}()` requires `{}` arguments, but not found in input `args`: {} and `kwargs`: {}.". + format(self._dygraph_function.__name__, arg_name, args, + kwargs)) + args.append(self._default_kwargs[arg_name]) + + return tuple(args), kwargs + + def args_to_input_spec(self, args, kwargs): + """ + Converts input arguments into InputSpec. + + 1. If specific input_spec, use them to construct feed layers. + 2. If input_spec is None, consider all Tensor and Numpy.ndarray as feed layers + + Args: + args(tuple): tuple of input arguments value of function containing default kwargs value. 
+ kwargs(dict): kwargs arguments received by **kwargs. + + Return: + Same nest structure with args by replacing value with InputSpec. + """ + input_with_spec = [] + + if self._input_spec is not None: + # Note: Because the value type and length of `kwargs` is uncertain. + # So we don't support to deal this case while specificing `input_spec` currently. + if kwargs: + raise ValueError( + "{} got unexpected keyword arguments: {}. Cannot trace the function when `input_spec` is specificed.". + format(self._dygraph_function.__name__, kwargs)) + + # Note: The length of `input_spec` can be greater than `args`, + # because `args` may contains non-tensor value merged form `kwargs` + # after `unified_args_and_kwargs`. + if len(args) < len(self._input_spec): + raise ValueError( + "Requires len(arguments) >= len(input_spec), but received len(args):{} < len(InputSpec): {}". + format(len(args), len(self._input_spec))) + + # replace argument with corresponding InputSpec. + input_with_spec = convert_to_input_spec(args, self._input_spec) + else: + for idx, input_var in enumerate(flatten(args)): + if isinstance(input_var, np.ndarray): + input_var = paddle.static.InputSpec.from_numpy(input_var) + elif isinstance(input_var, core.VarBase): + input_var = paddle.static.InputSpec.from_tensor(input_var) + + input_with_spec.append(input_var) + + input_with_spec = pack_sequence_as(args, input_with_spec) + + return input_with_spec + + @switch_to_static_graph + def to_static_inputs_with_spec(self, input_with_spec, main_program): + """ + Constructs feed layer by inputs with InputSpec information for main program. + + Args: + input_with_spec(tuple): input arguments by replacing argument with InputSpec. + main_program(Program): main program for inserting feed layer. + """ + flat_input_spec = flatten(input_with_spec) + + inputs = [] + block = main_program.global_block() + for i, var_spec in enumerate(flat_input_spec): + if isinstance(var_spec, paddle.static.InputSpec): + feed_layer = block.create_var( + # TODO(Aurelius84): consider a more elegant way to name this + name=var_spec.name or "feed_%s" % i, + shape=var_spec.shape, + dtype=var_spec.dtype, + is_data=True, + need_check_feed=False) + else: + feed_layer = var_spec + inputs.append(feed_layer) + + return pack_sequence_as(input_with_spec, inputs) + + def _verify_input_spec(self, input_spec): + """ + Verifies the `input_spec` and its element type is valid. + """ + if not isinstance(input_spec, (tuple, list)): + raise TypeError( + "The type(input_spec) should be one of (tuple, list), but received {}.". + format(type_name(input_spec))) + input_spec = tuple(input_spec) + for spec in flatten(input_spec): + if not isinstance(spec, paddle.static.InputSpec): + raise ValueError( + "The type(elem) from input_spec should be `InputSpec`, but received {}.". + format(type_name(spec))) + + return input_spec + + def __repr__(self): + return "function: {}({}), input_spec: {}".format( + self._dygraph_function.__name__, ','.join(self._arg_names), + self._input_spec) + + @property + def dygraph_function(self): + return self._dygraph_function + + @property + def args_name(self): + return self._arg_names + + @property + def input_spec(self): + return self._input_spec + + @property + def flat_input_spec(self): + return self._flat_input_spec + + @property + def code(self): + return func_to_source_code(self._dygraph_function) + + +def get_parameters(layer_instance, include_sublayer=True): + """ + Returns parameters of decorated layers. 
If set `include_sublayer` True, + the parameters created in sub layers will be added. + """ + params = collections.OrderedDict() + if layer_instance is not None: + if isinstance(layer_instance, layers.Layer): + if include_sublayer: + params = layer_instance.parameters() + names = [p.name for p in params] + params = collections.OrderedDict(zip(names, params)) + else: + params = layer_instance._parameters + else: + raise TypeError( + "Type of `layer_instance` should be nn.Layer, but received {}". + format(type_name(layer_instance))) + + return params + + +def get_buffers(layer_instance, include_sublayer=True): + """ + Returns Variable buffers of decorated layers. If set `include_sublayer` True, + the Variable buffers created in sub layers will be added. + """ + buffers = collections.OrderedDict() + if layer_instance is not None: + if isinstance(layer_instance, layers.Layer): + if include_sublayer: + buffers = layer_instance.buffers() + names = [buffer.name for buffer in buffers] + buffers = collections.OrderedDict(zip(names, buffers)) + else: + buffers = layer_instance._buffers + else: + raise TypeError( + "Type of `layer_instance` should be nn.Layer, but received {}". + format(type_name(layer_instance))) + return buffers + + +def convert_to_input_spec(inputs, input_spec): + """ + Replaces tensor in structured `inputs` by InputSpec in `input_spec`. + + Args: + inputs(list|dict): nested structure list or dict. + input_spec(list|dict): same nested structure list or dict as inputs. + + + Return: + Same structure with inputs by replacing the element with specified InputSpec. + """ + + def check_type_and_len(input, spec, check_length=False): + if type(input) is not type(spec): + raise TypeError('type(input) should be {}, but received {}.'.format( + type(spec), type(input))) + if check_length and len(input) < len(spec): + raise ValueError( + 'Requires len(inputs) >= len(input_spec), but received len(inputs):{} < len(input_spec):{}'. + format(len(inputs), len(input_spec))) + + if isinstance(input_spec, (tuple, list)): + input_with_spec = [] + check_type_and_len(inputs, input_spec, True) + + for i, spec in enumerate(input_spec): + out_spec = convert_to_input_spec(inputs[i], spec) + input_with_spec.append(out_spec) + + # Note: If the rest inputs contain tensor or numpy.ndarray + # without specific InputSpec, raise warning. + if len(inputs) > len(input_spec): + for rest_input in inputs[len(input_spec):]: + if isinstance(rest_input, (core.VarBase, np.ndarray)): + logging.warning( + "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. " + "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.". + format(type_name(rest_input))) + input_with_spec.extend(inputs[len(input_spec):]) + + return input_with_spec + elif isinstance(input_spec, dict): + input_with_spec = {} + check_type_and_len(inputs, input_spec, True) + for name, input in inputs.items(): + if name in input_spec: + input_with_spec[name] = convert_to_input_spec(input, + input_spec[name]) + else: + input_with_spec[name] = input + return input_with_spec + elif isinstance(input_spec, paddle.static.InputSpec): + return input_spec + else: + raise TypeError( + "The type(input_spec) should be a `InputSpec` or dict/list/tuple of it, but received {}.". 
+ type_name(input_spec)) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..75cb65085846d672d2488c98bf6ad625ac12e78b --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py @@ -0,0 +1,211 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import threading + +import six +from paddle.fluid import log_helper +from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code + +__all__ = ["TranslatorLogger", "set_verbosity", "set_code_level"] + +VERBOSITY_ENV_NAME = 'TRANSLATOR_VERBOSITY' +CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL' +DEFAULT_VERBOSITY = -1 +DEFAULT_CODE_LEVEL = -1 + + +def synchronized(func): + def wrapper(*args, **kwargs): + with threading.Lock(): + return func(*args, **kwargs) + + return wrapper + + +class TranslatorLogger(object): + """ + class for Logging and debugging during the tranformation from dygraph to static graph. + The object of this class is a singleton. + """ + + @synchronized + def __new__(cls, *args, **kwargs): + if not hasattr(cls, '_instance'): + cls._instance = object.__new__(cls, *args, **kwargs) + cls._instance._initialized = False + return cls._instance + + def __init__(self): + if self._initialized: + return + + self._initialized = True + self._logger = log_helper.get_logger( + __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s') + self._verbosity_level = None + self._transformed_code_level = None + + @property + def logger(self): + return self._logger + + @property + def verbosity_level(self): + if self._verbosity_level is not None: + return self._verbosity_level + else: + return int(os.getenv(VERBOSITY_ENV_NAME, DEFAULT_VERBOSITY)) + + @verbosity_level.setter + def verbosity_level(self, level): + self.check_level(level) + self._verbosity_level = level + + @property + def transformed_code_level(self): + if self._transformed_code_level is not None: + return self._transformed_code_level + else: + return int(os.getenv(CODE_LEVEL_ENV_NAME, DEFAULT_CODE_LEVEL)) + + @transformed_code_level.setter + def transformed_code_level(self, level): + self.check_level(level) + self._transformed_code_level = level + + def check_level(self, level): + if isinstance(level, (six.integer_types, type(None))): + rv = level + else: + raise TypeError("Level is not an integer: {}".format(level)) + return rv + + def has_code_level(self, level): + level = self.check_level(level) + return level == self.transformed_code_level + + def has_verbosity(self, level): + level = self.check_level(level) + return level >= self.verbosity_level + + def error(self, msg, *args, **kwargs): + self.logger.error(msg, *args, **kwargs) + + def warn(self, msg, *args, **kwargs): + self.logger.warn(msg, *args, **kwargs) + + def log(self, level, msg, *args, **kwargs): + if self.has_verbosity(level): + self.logger.log(level, msg, 
*args, **kwargs) + + def log_transformed_code(self, level, ast_node, transformer_name, *args, + **kwargs): + if self.has_code_level(level): + source_code = ast_to_source_code(ast_node) + header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\ + .format(level, transformer_name) + + msg = header_msg + source_code + self.logger.info(msg, *args, **kwargs) + + +_TRANSLATOR_LOGGER = TranslatorLogger() + + +def set_verbosity(level=0): + """ + Sets the verbosity level of log for dygraph to static graph. + There are two means to set the logging verbosity: + 1. Call function `set_verbosity` + 2. Set environment variable `TRANSLATOR_VERBOSITY` + + **Note**: + `set_verbosity` has a higher priority than the environment variable. + + Args: + level(int): The verbosity level. The larger value idicates more verbosity. + The default value is 0, which means no logging. + + Examples: + .. code-block:: python + + import os + import paddle + + paddle.jit.set_verbosity(1) + # The verbosity level is now 1 + + os.environ['TRANSLATOR_VERBOSITY'] = '3' + # The verbosity level is now 3, but it has no effect because it has a lower priority than `set_verbosity` + """ + _TRANSLATOR_LOGGER.verbosity_level = level + + +def get_verbosity(): + return _TRANSLATOR_LOGGER.verbosity_level + + +LOG_AllTransformer = 100 + + +def set_code_level(level=LOG_AllTransformer): + """ + Sets the level to print code from specific level of Ast Transformer. + There are two means to set the code level: + 1. Call function `set_code_level` + 2. Set environment variable `TRANSLATOR_CODE_LEVEL` + + **Note**: + `set_code_level` has a higher priority than the environment variable. + + Args: + level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers. + + Examples: + .. code-block:: python + + import paddle + + paddle.jit.set_code_level(2) + # It will print the transformed code at level 2, which means to print the code after second transformer, + # as the date of August 28, 2020, it is CastTransformer. + + os.environ['TRANSLATOR_CODE_LEVEL'] = '3' + # The code level is now 3, but it has no effect because it has a lower priority than `set_code_level` + + """ + _TRANSLATOR_LOGGER.transformed_code_level = level + + +def get_code_level(): + return _TRANSLATOR_LOGGER.transformed_code_level + + +def error(msg, *args, **kwargs): + _TRANSLATOR_LOGGER.error(msg, *args, **kwargs) + + +def warn(msg, *args, **kwargs): + _TRANSLATOR_LOGGER.warn(msg, *args, **kwargs) + + +def log(level, msg, *args, **kwargs): + _TRANSLATOR_LOGGER.log(level, msg, *args, **kwargs) + + +def log_transformed_code(level, ast_node, transformer_name, *args, **kwargs): + _TRANSLATOR_LOGGER.log_transformed_code(level, ast_node, transformer_name, + *args, **kwargs) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index ceacba25375c64552e1e85d046ca494b078ee66d..698d989343a23015529a3b37b285640466d1c30d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -13,25 +13,23 @@ # limitations under the License. 
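Because `TranslatorLogger.__new__` above returns a cached instance, the `convert_call` and `DygraphToStaticAst` components share one set of verbosity settings with whatever the user configures. A minimal sketch of that singleton behaviour, using only names defined in this patch:

.. code-block:: python

    from paddle.fluid.dygraph.dygraph_to_static import logging_utils

    a = logging_utils.TranslatorLogger()
    b = logging_utils.TranslatorLogger()
    assert a is b  # __new__ makes the logger a process-wide singleton

    a.verbosity_level = 2     # same effect as paddle.jit.set_verbosity(2)
    print(b.verbosity_level)  # 2, because a and b are the same object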
from __future__ import print_function - +import gast import collections +import logging import inspect +import six import textwrap import threading import warnings import gast -import numpy as np -from paddle.fluid import core -from paddle.fluid import executor from paddle.fluid import framework -from paddle.fluid import scope_guard -from paddle.fluid import unique_name -from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph import layers +from paddle.fluid.data_feeder import check_type +from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph.base import param_guard from paddle.fluid.dygraph.base import switch_to_static_graph -from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst +from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info @@ -41,13 +39,19 @@ from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_progr from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.utils import type_name from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap -from paddle.fluid.layers.utils import flatten -from paddle.fluid.layers.utils import pack_sequence_as +from paddle.fluid.dygraph.dygraph_to_static.utils import make_hashable +from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec +from paddle.fluid.dygraph.dygraph_to_static.function_spec import get_buffers, get_parameters from paddle.fluid.wrapped_decorator import signature_safe_contextmanager __all__ = ['ProgramTranslator', 'convert_to_static'] +# For each traced function, we set `max_traced_program_count` = 10 to consider caching performance. +# Once exceeding the threshold, we will raise warning to users to make sure the conversion is as expected. +MAX_TRACED_PROGRAM_COUNT = 10 + class FunctionCache(object): """ @@ -136,100 +140,323 @@ def convert_to_static(function): return static_func -class FunctionSpec(object): - def __init__(self, func, args, kwargs): - self._dyfunc = func - self._args = args - self._kwargs = kwargs +class CacheKey(object): + """ + Cached key for ProgramCache. + """ + + __slots__ = ['function_spec', 'input_with_spec', 'class_instance'] - # TODO(liym27): func has multi layer decorator - dyfunc = getattr(func, '__wrapped__', func) - self._dyfunc_code = inspect.getsource(dyfunc) + def __init__(self, function_spec, input_with_spec, class_instance): + """ + Initializes a cache key. - def is_method(self): - return self._args and isinstance(self._args[0], layers.Layer) + Args: + functions_spec(FunctionSpec): a FunctionSpec instance of decorated function. + input_with_spec(list[InputSpec]): actual inputs with some arguments replaced by InputSpec. + class_instance(object): a instance of class `Layer`. + """ + self.function_spec = function_spec + self.input_with_spec = input_with_spec + self.class_instance = class_instance - def parameters(self, include_sublayer=True): + @classmethod + def from_func_and_args(cls, function_spec, args, kwargs, class_instance): """ - Returns parameters of decorated layers. 
If set `include_sublayer` True, - the parameters created in sub layers will be added. + Generated a CacheKey instance by given inputs. + + Args: + functions_spec(FunctionSpec): a FunctionSpec instance of decorated function. + args(tuple): tuple of actual inputs arguments. + kwargs(dict): dict of actual inputs keyword arguments. + class_instance(object): a instance of class `Layer`. """ - params = collections.OrderedDict() - if self.is_method(): - layer_instance = self._args[0] - if include_sublayer: - params = layer_instance.parameters() - names = [p.name for p in params] - params = collections.OrderedDict(zip(names, params)) + # 1. filter `self` in args + if args and isinstance(args[0], layers.Layer): + args = args[1:] + # 2. convert tensor and numpy array into InputSpec + _args, _kwargs = function_spec.unified_args_and_kwargs(args, kwargs) + input_with_spec = function_spec.args_to_input_spec(_args, _kwargs) + + # 3. check whether hit the cache or build a new program for the input arguments + return CacheKey(function_spec, input_with_spec, class_instance) + + def __hash__(self): + error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)." + return hash((id(self.function_spec), + make_hashable(self.input_with_spec, error_msg), + self.class_instance)) + + def __eq__(self, other): + return (type(self) is type(other)) and hash(self) == hash(other) + + def __neq__(self, other): + return not self == other + + def __repr__(self): + return "id(function_spec): {}, input_with_spec: {}, class_instance: {}".format( + id(self.function_spec), self.input_with_spec, self.class_instance) + + +def unwrap_decorators(func): + """ + Unwraps a decorated function and returns the decorator list and inner target. + """ + decorators = [] + cur = func + while True: + if isinstance(cur, StaticLayer): + decorators.append(cur) + # Note: if `cur` is a method, keep it as bound method of class. + instance = cur._class_instance + if instance is not None: + cur = cur.dygraph_function.__get__(instance) else: - params = layer_instance._parameters - return params + cur = cur.dygraph_function + else: + break + return decorators, cur + - def buffers(self, include_sublayer=True): +class StaticLayer(object): + """ + Wrapper class to Manage program conversion of decorated function. + + """ + + def __init__(self, function, input_spec=None): + """ + Initializes a `StaticLayer`. + + Args: + function(callable): A function or method that will be converted into static program. + input_spec(list[InputSpec]): list of InputSpec to specify the `shape/dtype/name` information for each input argument, default None. + """ + # save the instance `self` while decorating a method of class. + if inspect.ismethod(function): + self._dygraph_function = getattr(function, '__func__') + self._class_instance = getattr(function, '__self__') + else: + self._dygraph_function = function + self._class_instance = None + + self._input_spec = input_spec + self._function_spec = FunctionSpec(function, input_spec) + self._program_cache = ProgramCache() + # Note: Hold a reference to ProgramTranslator for switching `enable_declarative`. + self._program_trans = ProgramTranslator() + + def __get__(self, instance, owner): """ - Returns Variable buffers of decorated layers. If set `include_sublayer` True, - the Variable buffers created in sub layers will be added. + Overrides this method to parse the class instance and call bound method correctly. 
+ + For example: + + ''' + class Net(Layer): + def __init__(self): + pass + + @paddle.jit.to_static + def forward(self, x, y): + return x + y + + net = Net() + out = net(x, y) + ''' + + In above case, `net(x, y)` will call `net.forward(x, y)` firstly that is a bound method + of `Net` instance. After decorated by `@paddle.jit.to_static`, it will firstly to call `__get__` + to parse the class instance correctly instead of the `StaticLayer` instance. + """ + self._class_instance = instance + return self + + def __call__(self, *args, **kwargs): """ - buffers = collections.OrderedDict() - if self.is_method(): - layer_instance = self._args[0] - if include_sublayer: - buffers = layer_instance.buffers() - names = [buffer.name for buffer in buffers] - buffers = collections.OrderedDict(zip(names, buffers)) + Supports to call the returned instance with input `args` and `kwargs` directly. + + Args: + *args(tuple): tuple of all input arguments from original decorated function. + **kwargs(dict): dict of all input keyward arguments from original decorated function. + + Return: + Outputs of decorated function. + """ + # 1. call dygraph function directly if not enable `declarative` + if not self._program_trans.enable_declarative: + warnings.warn( + "The decorator '@paddle.jit.to_static' doesn't work when setting ProgramTranslator.enable=False. " + "We will just return dygraph output.") + return self._call_dygraph_function(*args, **kwargs) + + # 2. trace ops from dygraph layers and cache the generated program. + args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) + try: + concrete_program, partial_program_layer = self.get_concrete_program( + *args, **kwargs) + + # 3. synchronize self.training attribute. + if isinstance(self._class_instance, layers.Layer): + partial_program_layer.training = self._class_instance.training + + # 4. return outputs. + return partial_program_layer(args) + except Exception as e: + if not hasattr(e, ERROR_DATA): + # runtime error + attach_error_data(e, in_runtime=True) + error_data = getattr(e, ERROR_DATA, None) + if error_data: + new_exception = error_data.create_exception() + if six.PY3: + # NOTE(liym27): + # 1. Why `raise new_exception from None`? + # In Python 3, by default, an new exception is raised with trace information of the caught exception. + # This only raises new_exception and hides unwanted implementation details from tracebacks of the + # caught exception. + # 2. Use exec to bypass syntax error checking in Python 2. + + six.exec_("raise new_exception from None") + else: + raise new_exception else: - buffers = layer_instance._buffers - return buffers + raise - @switch_to_static_graph - def to_static_inputs(self, main_program): - inputs = [] - block = main_program.global_block() - for input_var in flatten(self.args): - if isinstance(input_var, np.ndarray): - feed_layer = block.create_var( - name=unique_name.generate('feed'), - shape=list(input_var.shape), - dtype=input_var.dtype, - is_data=True, - need_check_feed=False) - elif isinstance(input_var, core.VarBase): - feed_layer = block.create_var( - name=input_var.name, - shape=list(input_var.shape), - dtype=input_var.dtype, - stop_gradient=input_var.stop_gradient, - need_check_feed=False) + def _call_dygraph_function(self, *args, **kwargs): + """ + Calls dygraph function directly and returns the outputs. + + Args: + *args(tuple): tuple of all input arguments from original decorated function. + **kwargs(dict): dict of all input keyward arguments from original decorated function. 
+ + Return: + Outputs of dygraph function. + """ + if self._class_instance is not None: + dygraph_function = self._dygraph_function.__get__( + self._class_instance) + else: + dygraph_function = self._dygraph_function + + return dygraph_function(*args, **kwargs) + + def get_concrete_program(self, *args, **kwargs): + """ + Returns traced concrete program and inner executable partial layer. + + Args: + *args(tuple): input arguments values or InputSpec + **kwargs(dict) : input kwargs values. + + Returns: + Traced ConcreteProgram and executable translated Layer. + """ + # 1. unify args/kwargs and replace Tensor with InputSpec + if len(args) != len(self._function_spec.args_name): + args, kwargs = self._function_spec.unified_args_and_kwargs(args, + kwargs) + input_with_spec = self._function_spec.args_to_input_spec(args, kwargs) + + # 2. generate cache key + cache_key = CacheKey(self._function_spec, input_with_spec, + self._class_instance) + + # 3. check whether hit the cache or build a new program for the input arguments + concrete_program, partial_program_layer = self._program_cache[cache_key] + return concrete_program, partial_program_layer + + def get_traced_count(self): + """ + Returns the number of traced programs for the decorated function. + """ + return len(self._program_cache) + + @property + def code(self): + """ + Returns the source code of transformed static function for debugging. + """ + static_func = convert_to_static(self._dygraph_function) + source_code = func_to_source_code(static_func) + return source_code + + @property + def dygraph_function(self): + """ + Returns the original decorated function. + """ + return self._dygraph_function + + @property + def concrete_program(self): + """ + Returns recent ConcreteProgram instance of decorated function. + """ + # if specific the `input_spec`, the length of program_cache will always 1, + # else, return the last one. + cached_program_len = len(self._program_cache) + # If specific `input_spec`, apply convertion from dygraph layers into static Program. + if cached_program_len == 0: + if len(self._function_spec.flat_input_spec) > 0: + input_spec = self._function_spec.input_spec + concrete_program, _ = self.get_concrete_program(*input_spec) + return concrete_program else: - feed_layer = input_var + raise ValueError("No valid transformed program for {}".format( + self._function_spec)) + # If more than one programs have been cached, return the recent converted program by default. + elif cached_program_len > 1: + logging.warning( + "Current {} has more than one cached programs: {}, the last traced progam will be return by default.". + format(self._function_spec, cached_program_len)) + + cache_key, (concrete_program, + partial_layer) = self._program_cache.last() + return concrete_program - inputs.append(feed_layer) - # Restores the nested structure as self.args - return pack_sequence_as(self.args, inputs) + @property + def inputs(self): + """ + Returns input tensors of recent converted static program. + """ + concrete_program = self.concrete_program + inputs = [ + var for var in flatten(concrete_program.inputs) + if isinstance(var, framework.Variable) + ] + return inputs @property - def dyfunc(self): - return self._dyfunc + def outputs(self): + """ + Returns output tensors of recent converted static program. 
+ """ + concrete_program = self.concrete_program + outputs = [ + var for var in flatten(concrete_program.outputs) + if isinstance(var, framework.Variable) + ] + + return outputs @property - def args(self): - return self._args - - def __key(self): - # Note: if dygraph function is a method of class, - # consider instance info as hash key. - if self.is_method(): - # NOTE: we can use Layer's (instance + function code) as hash key. - # An instance will not hold two identical methods - return self._dyfunc_code, self._args[0] - else: - return self._dyfunc + def main_program(self): + """ + Returns recent converted static main program. + """ + concrete_program = self.concrete_program + main_program = concrete_program.main_program + return main_program - def __hash__(self): - return hash(self.__key()) + @property + def program_cache(self): + return self._program_cache - def __eq__(self, other): - return self.__key() == self.__key() + @property + def function_spec(self): + return self._function_spec # Flag that indicates whether running code under `@declarative` @@ -255,11 +482,17 @@ def _switch_declarative_mode_guard_(is_declarative=True): class ConcreteProgram(object): + + __slots__ = [ + 'inputs', 'outputs', 'main_program', "startup_program", "parameters", + "function" + ] + def __init__(self, inputs, outputs, parameters, - func, + function, main_program, startup_program=None): self.inputs = inputs @@ -267,17 +500,21 @@ class ConcreteProgram(object): self.main_program = main_program self.startup_program = startup_program self.parameters = parameters - self.func_spec = func + self.function = function @staticmethod @switch_to_static_graph - def from_func_spec(func_spec): + def from_func_spec(func_spec, input_spec, class_instance): """ Builds the main_program with specialized inputs and returns outputs of program as fetch_list. + + Args: + func_spec(FunctionSpec): A FunctionSpec instance for decorated function. + input_spec(list[InputSpec]): """ # Transforms dygraph function into static function and caches it. - dygraph_function = func_spec.dyfunc + dygraph_function = func_spec.dygraph_function static_func = convert_to_static(dygraph_function) main_program, startup_program = framework.Program(), framework.Program() @@ -291,15 +528,20 @@ class ConcreteProgram(object): with framework.program_guard(main_program, startup_program): with _switch_declarative_mode_guard_(is_declarative=True): # 1. Adds `fluid.data` layers for input if needed - inputs = func_spec.to_static_inputs(main_program) + inputs = func_spec.to_static_inputs_with_spec(input_spec, + main_program) + if class_instance: + inputs = tuple([class_instance] + list(inputs)) # 2. Gets all ParamBases and buffered VarBases in the function - all_parameters_and_buffers = list(func_spec.parameters().values( - )) + list(func_spec.buffers().values()) + all_parameters_and_buffers = list( + get_parameters(class_instance).values()) + list( + get_buffers(class_instance).values()) # 3. Builds program only once and returns the output Variables. 
- with param_guard(func_spec.parameters(False)), param_guard( - func_spec.buffers(False)): + with param_guard(get_parameters( + class_instance, False)), param_guard( + get_buffers(class_instance, False)): try: outputs = static_func(*inputs) except BaseException as e: @@ -317,7 +559,7 @@ class ConcreteProgram(object): inputs=inputs, outputs=outputs, parameters=all_parameters_and_buffers, - func=dygraph_function, + function=dygraph_function, main_program=main_program, startup_program=startup_program) @@ -330,27 +572,38 @@ class ProgramCache(object): def __init__(self): self._caches = collections.OrderedDict() - def _build_once(self, func_spec): - concrete_program = ConcreteProgram.from_func_spec(func_spec) + def _build_once(self, cache_key): + concrete_program = ConcreteProgram.from_func_spec( + func_spec=cache_key.function_spec, + input_spec=cache_key.input_with_spec, + class_instance=cache_key.class_instance) return concrete_program, partial_program_from(concrete_program) def __getitem__(self, item): - if not isinstance(item, FunctionSpec): - raise ValueError( - 'type(item) should be FunctionSpec, but received %s' % - type(item)) + if not isinstance(item, CacheKey): + raise ValueError('type(item) should be CacheKey, but received %s' % + type_name(item)) + if item not in self._caches: self._caches[item] = self._build_once(item) + # Note: raise warnings if number of traced program is more than `max_tracing_count` + current_tracing_count = len(self._caches) + if current_tracing_count > MAX_TRACED_PROGRAM_COUNT: + logging.warning( + "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. " + "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.". + format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT)) + return self._caches[item] def get_program(self, item): - if not isinstance(item, FunctionSpec): + if not isinstance(item, CacheKey): raise ValueError( "Input item's type should be FunctionSpec, but received %s" % - type(item)) + type_name(item)) if item not in self._caches: raise RuntimeError( - "Failed to find program for input item, please decorate input function by `@declarative`." + "Failed to find program for input item, please decorate input function by `@paddle.jit.to_static`." ) return self._caches[item] @@ -360,6 +613,12 @@ class ProgramCache(object): key = next(reversed(self._caches.keys())) return key, self._caches[key] + def __len__(self): + return len(self._caches) + + def concrete_programs(self): + return [cp for key, (cp, _) in self._caches.iteritems()] + def synchronized(func): func.__lock__ = threading.Lock() @@ -508,9 +767,11 @@ class ProgramTranslator(object): "We will just return dygraph output.") return dygraph_func(*args, **kwargs) - function_spec = FunctionSpec(dygraph_func, args, kwargs) - concrete_program, partial_program_layer = self._program_cache[ - function_spec] + function_spec = FunctionSpec(dygraph_func) + cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, + getattr(dygraph_func, + '__self__', None)) + _, partial_program_layer = self._program_cache[cache_key] if args and isinstance(args[0], layers.Layer): # Synchronize self.training attribute. 
@@ -624,8 +885,12 @@ class ProgramTranslator(object): "We will just return dygraph output.") return dygraph_func(*args, **kwargs) - func_spec = FunctionSpec(dygraph_func, args, kwargs) - concrete_program, _ = self._program_cache[func_spec] + function_spec = FunctionSpec(dygraph_func) + cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, + getattr(dygraph_func, + '__self__', None)) + concrete_program, partial_program_layer = self._program_cache[cache_key] + # Note: concrete_program hold all input/output infos include non-Variable input_vars = [ var for var in concrete_program.inputs diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 6636bf7c4b40514c972541b35f5f7f5ff79058ee..ba02a983f8e641079d8a60b166a6f098e6f725a8 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -18,16 +18,23 @@ import ast import astor import atexit import copy +import collections import gast -import imp import inspect import os import six import tempfile import textwrap +import numpy as np from paddle.fluid import unique_name +# imp is deprecated in python3 +if six.PY2: + import imp +else: + from importlib.machinery import SourceFileLoader + dygraph_class_to_static_api = { "CosineDecay": "cosine_decay", "ExponentialDecay": "exponential_decay", @@ -41,6 +48,77 @@ dygraph_class_to_static_api = { FOR_ITER_INDEX_PREFIX = '__for_loop_var_index' FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len' +# FullArgSpec is valid from Python3. Defined a Namedtuple to +# to make it available in Python2. +FullArgSpec = collections.namedtuple('FullArgSpec', [ + 'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults', + 'annotations' +]) + + +def getfullargspec(target): + if hasattr(inspect, "getfullargspec"): + return inspect.getfullargspec(target) + else: + argspec = inspect.getargspec(target) + return FullArgSpec( + args=argspec.args, + varargs=argspec.varargs, + varkw=argspec.keywords, + defaults=argspec.defaults, + kwonlyargs=[], + kwonlydefaults=None, + annotations={}) + + +def parse_arg_and_kwargs(function): + """ + Returns full argument names as list. e.g ['x', 'y', 'z'] + """ + fullargspec = getfullargspec(function) + arg_names = fullargspec.args + if arg_names and 'self' == arg_names[0]: + arg_names = fullargspec.args[1:] + + # parse default kwargs + default_kwargs = {} + default_values = fullargspec.defaults + if default_values: + assert len(default_values) <= len(arg_names) + default_kwarg_names = arg_names[-len(default_values):] + default_kwargs = dict(zip(default_kwarg_names, default_values)) + + return arg_names, default_kwargs + + +def type_name(v): + return type(v).__name__ + + +def make_hashable(x, error_msg=None): + """ + Makes input `x` hashable. + + For some unhashable objects, such as `dict/list/np.ndarray`,applying hash function by using their values. + """ + if isinstance(x, (tuple, list)): + return tuple(map(make_hashable, x)) + + try: + hash(x) + except TypeError: + if isinstance(x, np.ndarray): + # Note: `tostring()` will return the binary data from np.ndarray that + # means different value will lead to different hash code. + return hash(x.tostring()) + elif isinstance(x, dict): + return tuple(map(make_hashable, x.values())) + + error_msg = error_msg or "Requires a hashable object." 
+ raise ValueError(error_msg + " But received type: %s" % type_name(x)) + + return x + def _is_api_in_module_helper(obj, module_prefix): m = inspect.getmodule(obj) @@ -391,7 +469,10 @@ def ast_to_func(ast_root, dyfunc, delete_on_exit=True): atexit.register(lambda: remove_if_exit(f.name)) atexit.register(lambda: remove_if_exit(f.name[:-3] + ".pyc")) - module = imp.load_source(module_name, f.name) + if six.PY2: + module = imp.load_source(module_name, f.name) + else: + module = SourceFileLoader(module_name, f.name).load_module() func_name = dyfunc.__name__ if not hasattr(module, func_name): raise ValueError( diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 0da5c57f1bc92f680335872c68ddb96079757614..7f3d450a49c7d3fcc9ca1d3c2d7c5eb732671c6c 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -378,7 +378,7 @@ def _load_persistable_vars_by_program(model_path, new_var = framework._varbase_creator( type=each_var.type(), name=each_var.name(), - shpae=each_var.shape(), + shape=each_var.shape(), dtype=each_var.dtype(), persistable=True) if params_filename is None: @@ -437,8 +437,16 @@ def _load_persistable_vars(model_path, value: key for key, value in program_holder._suffix_varname_dict.items() } - # NOTE: some var may not be Parameter - for name in sorted(extra_var_info): + + # NOTE(chenweihang): we need load persistable vars based the program, + # because the program may be pruned when `save_inference_model`, some + # var in `extra_var_info` may have been pruned + for name in sorted(inv_suffix_varname_dict): + if name not in extra_var_info: + raise RuntimeError( + "The model to be loaded is not complete." + "The variable `%s` of program cannot be found in loaded model.", + name) # get suffix var name, see [why need to append suffix to persistable vars] new_name = inv_suffix_varname_dict[name] # create output varbase @@ -628,7 +636,7 @@ class TranslatedLayer(layers.Layer): ) if not isinstance(persistable_vars, dict): raise TypeError( - "TranslatedLayer need to use persisatbale variable dict for initialization." + "TranslatedLayer need to use persistable variable dict for initialization." 
) self._program_holder_dict = programs @@ -641,19 +649,21 @@ class TranslatedLayer(layers.Layer): # name contains `.` originally, such as `linear_0.w_0`, so here # need to generate new var name for each var self._persistable_var_name_dict = dict() - for name, var in persistable_vars.items(): - if isinstance(var, framework.ParamBase): - dy_name = _generate_unique_var_name(PARAMETER_NAME_PREFIX) - self._persistable_var_name_dict[name] = dy_name - self.add_parameter(dy_name, var) - elif isinstance(var, core.VarBase): - dy_name = _generate_unique_var_name(BUFFER_NAME_PREFIX) - self._persistable_var_name_dict[name] = dy_name - self.register_buffer(dy_name, var) - else: - raise TypeError( - "Adding persistent variable which to layer is not supported now" - ) + # the TranslatedLayer object holded var names count started from 0 + with unique_name.guard(): + for name, var in persistable_vars.items(): + if isinstance(var, framework.ParamBase): + dy_name = _generate_unique_var_name(PARAMETER_NAME_PREFIX) + self._persistable_var_name_dict[name] = dy_name + self.add_parameter(dy_name, var) + elif isinstance(var, core.VarBase): + dy_name = _generate_unique_var_name(BUFFER_NAME_PREFIX) + self._persistable_var_name_dict[name] = dy_name + self.register_buffer(dy_name, var) + else: + raise TypeError( + "Adding persistent variable which to layer is not supported now" + ) self._is_test = True @@ -675,7 +685,7 @@ class TranslatedLayer(layers.Layer): # 1. load program desc & construct _ProgramHolder programs = _construct_program_holders(model_path, model_filename) - # 2. load layer parameters & parameter attirbutes + # 2. load layer parameters & parameter attributes persistable_vars = _construct_params_and_buffers( model_path, programs, separate_params, params_filename) @@ -743,7 +753,7 @@ class TranslatedLayer(layers.Layer): core.VarDesc.VarType.STEP_SCOPES, True) tmp_scope_vec.value().set_scope(program_holder.scope) - # 2. run prorgam by op + # 2. run program by op trace_program = program_holder.infer_program if self._is_test else program_holder.train_program end_op_index = program_holder.infer_program.block(0).op_size() framework._dygraph_tracer().trace_op( @@ -764,7 +774,7 @@ class TranslatedLayer(layers.Layer): # will be SelectedRows, not LoDTensor. But tracer will just # set param grad VarBase by forward VarBase(LoDTensor) # If we don't change grad_var type here, RunProgramOp need - # transform SelectedRows to LoDTensor forcely, it may not + # transform SelectedRows to LoDTensor forcibly, it may not # be user wanted result. 
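# --- Editorial sketch of the variable-adoption pattern introduced above in
# TranslatedLayer.__init__ (not the class itself; `adopt_persistable_vars` and
# `gen_unique_name` are illustrative stand-ins). Loaded names such as
# "linear_0.w_0" contain ".", so each variable is re-registered under a fresh,
# dot-free name and the old->new mapping is kept for later lookups.
def adopt_persistable_vars(layer, persistable_vars, gen_unique_name):
    name_map = {}
    for original_name, var in persistable_vars.items():
        new_name = gen_unique_name(original_name)   # unique and free of '.'
        name_map[original_name] = new_name
        if getattr(var, 'trainable', None) is not None:
            # stands in for isinstance(var, framework.ParamBase)
            layer.add_parameter(new_name, var)
        else:
            # plain VarBase values become buffers
            layer.register_buffer(new_name, var)
    return name_map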
for persistable_var in persistable_vars: grad_var_name = var.name + core.grad_var_suffix() diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 337d2dfc008e8298925bb3febdc2007736e4a1b6..853c16a5d0f7129f097f7fca860ab260f9dc7fd5 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -19,12 +19,13 @@ import pickle import warnings import six +import paddle from paddle.fluid import core from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph -from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA -from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer from paddle.fluid.dygraph.layers import Layer from paddle.fluid.executor import Executor, scope_guard @@ -33,7 +34,10 @@ from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dyg from paddle.fluid.framework import dygraph_only, in_dygraph_mode from paddle.fluid.wrapped_decorator import wrap_decorator -__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func'] +__all__ = [ + 'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level', + 'set_verbosity' +] def create_program_from_desc(program_desc): @@ -128,17 +132,42 @@ def _dygraph_to_static_func_(dygraph_func): dygraph_to_static_func = wrap_decorator(_dygraph_to_static_func_) -def _declarative_(dygraph_func): +def copy_decorator_attrs(original_func, decorated_obj): + """ + Copies some necessary attributes from original function into decorated function. + + Args: + original_func(callable): the original decorated function. + decorated_obj(StaticLayer): the target decorated StaticLayer object. + """ + decorator_name = "declarative" + + decorated_obj.__name__ = original_func.__name__ + decorated_obj._decorator_name = decorator_name + decorated_obj.__wrapped__ = original_func + decorated_obj.__doc__ = original_func.__doc__ + if hasattr(original_func, "__module__"): + decorated_obj.__module__ = original_func.__module__ + + return decorated_obj + + +def declarative(function=None, input_spec=None): """ Converts imperative dygraph APIs into declarative function APIs. Decorator @declarative handles the Program and Executor of static mode and returns - the result as a dygraph VarBase. + the result as dygraph Tensor(s). Users could use the returned dygraph + Tensor(s) to do imperative training, inference, or other operations. If the + decorated function calls other imperative function, the called one will be + converted into declarative function as well. Args: - dygraph_func (callable): callable imperative function. + function (callable): callable imperative function. + input_spec(list[InputSpec]): list of InputSpec to specific the shape/dtype/name + information of each input Tensor. Returns: - VarBase: containing the numerical result. + Tensor(s): containing the numerical result. Examples: .. 
code-block:: python @@ -147,6 +176,7 @@ def _declarative_(dygraph_func): import numpy as np from paddle.fluid.dygraph.jit import declarative + fluid.enable_dygraph() @declarative def func(x): @@ -163,37 +193,27 @@ def _declarative_(dygraph_func): """ - def __impl__(*args, **kwargs): - program_translator = ProgramTranslator() - if not program_translator.enable_declarative: - warnings.warn( - "The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. " - "We will just return dygraph output.") - return dygraph_func(*args, **kwargs) - try: - return program_translator.get_output(dygraph_func, *args, **kwargs) - except Exception as e: - error_data = getattr(e, ERROR_DATA, None) - if error_data: - new_exception = error_data.create_exception() - if six.PY3: - # NOTE(liym27): - # 1. Why `raise new_exception from None`? - # In Python 3, by default, an new exception is raised with trace information of the caught exception. - # This only raises new_exception and hides unwanted implementation details from tracebacks of the - # caught exception. - # 2. Use exec to bypass syntax error checking in Python 2. - - six.exec_("raise new_exception from None") - else: - raise new_exception - else: - raise + def decorated(python_func): + """ + Decorates a python function into a StaticLayer object. + """ + # Step 1. unwrap the function if it is already decorated. + _, python_func = unwrap_decorators(python_func) - return __impl__ + # Step 2. copy some attributes from original python function. + static_layer = copy_decorator_attrs( + original_func=python_func, + decorated_obj=StaticLayer( + function=python_func, input_spec=input_spec)) + + return static_layer + # for usage: `declarative(foo, ...)` + if function is not None: + return decorated(function) -declarative = wrap_decorator(_declarative_) + # for usage: `@declarative` + return decorated class SaveLoadConfig(object): @@ -335,7 +355,7 @@ class SaveLoadConfig(object): # use SaveLoadconfig.output_spec model_path = "simplenet.example.model.output_spec" configs = fluid.dygraph.jit.SaveLoadConfig() - # only keep the predicted output in saved model, diccard loss + # only keep the predicted output in saved model, discard loss configs.output_spec = [out] fluid.dygraph.jit.save( @@ -370,7 +390,7 @@ class SaveLoadConfig(object): The name of file to save the translated program of target Layer. Default filename is :code:`__model__` . - Exampels: + Examples: .. code-block:: python import numpy as np @@ -440,7 +460,7 @@ class SaveLoadConfig(object): The name of file to save all persistable variables in target Layer. Default file name is :code:`__variables__` . - Exampels: + Examples: .. code-block:: python import numpy as np @@ -593,7 +613,7 @@ def save(layer, model_path, input_spec=None, configs=None): The default saved translated program file name is ``__model__``, and the default saved persistable variables file name is ``__variables__``, and it also saved some additional variable description information to file - ``__varibales.info__``, these additional information is used in fine-tuning. + ``__variables.info__``, these additional information is used in fine-tuning. The saved model can be loaded by follow APIs: - :ref:`api_imperative_jit_load` @@ -603,7 +623,7 @@ def save(layer, model_path, input_spec=None, configs=None): Args: layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`. model_path (str): the directory to save the model. 
- input_spec (list[Varibale], optional): Describes the input of the saved model. + input_spec (list[Variable], optional): Describes the input of the saved model. It is the example inputs that will be passed to saved TranslatedLayer's forward function. If None, all input variables of the original Layer's forward function would be the inputs of the saved model. Default None. @@ -717,16 +737,17 @@ def save(layer, model_path, input_spec=None, configs=None): "The input input_spec should be 'list', but received input_spec's type is %s." % type(input_spec)) for var in input_spec: - if not isinstance(var, core.VarBase): + if not isinstance(var, (core.VarBase, Variable, + paddle.static.InputSpec)): raise TypeError( - "The element in input_spec list should be 'Variable', but received element's type is %s." + "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s." % type(var)) # 2. get program of declarative Layer.forward - prog_cache = prog_translator.get_program_cache() - # make dummy args & kwargs, to get excepted FunctionSpec - layer_func = FunctionSpec(type(layer).forward, [layer], {}) - concrete_program, _ = prog_cache.get_program(layer_func) + if not isinstance(layer.forward, StaticLayer): + raise RuntimeError( + "layer.forward need to be decorated by `@declarative`.") + concrete_program = layer.forward.concrete_program # NOTE: we maintain the mapping of variable name to # structured name, the buffer variable (non-persistable) @@ -810,7 +831,7 @@ def load(model_path, configs=None): For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`, there will be the following limitations when using it in fine-tuning: 1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable. - 2. All saved model's feed targets need to be passed into TranslatedLayer's forwrad function. + 2. All saved model's feed targets need to be passed into TranslatedLayer's forward function. 3. The variable's ``stop_gradient`` information is lost and can not be recovered. 4. The parameter's ``trainable`` information is lost and can not be recovered. diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 7b2434e1a213a9594b2182f5dbd85feb20a6a78e..1ef719b9da187be659d9c898ec996b5ad0c0d8a6 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -553,7 +553,10 @@ class Layer(core.Layer): "The name of buffer should be a string, but received {}.". format(type(name).__name__)) elif '.' in name: - raise KeyError("The name of buffer can not contain \".\"") + raise KeyError( + "The name of buffer can not contain `.`, " + "because when you access the newly added buffer in the " + "form of `self.**.**`, it will cause AttributeError.") elif name == '': raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: @@ -736,20 +739,38 @@ class Layer(core.Layer): Returns: Parameter: the parameter passed in. 
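        A minimal usage sketch (``MyLayer`` below is hypothetical and only
        illustrates the naming checks: the name must be a non-empty string
        without ``.`` and must not already exist on the layer):
            .. code-block:: python

                import paddle.fluid as fluid

                class MyLayer(fluid.dygraph.Layer):
                    def __init__(self):
                        # super().__init__() must be called before add_parameter
                        super(MyLayer, self).__init__()
                        w = self.create_parameter(
                            attr=None, shape=[2, 3], dtype='float32')
                        self.add_parameter("w", w)

                with fluid.dygraph.guard():
                    layer = MyLayer()
                    print(layer.parameters())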
""" - if parameter is None: - self._parameters[name] = None - elif not isinstance(parameter, framework.Parameter): + if '_parameters' not in self.__dict__: + raise RuntimeError( + "super(YourLayer, self).__init__() should be called firstly.") + elif not isinstance(name, six.string_types): raise TypeError( - "parameter assignment requires Parameter or None, but got '{}'" - .format(type(parameter).__name__)) + "The name of parameter should be a string, but received {}.". + format(type(name).__name__)) + elif '.' in name: + raise KeyError( + "The name of parameter can not contain `.`, " + "because when you access the newly added parameter in the " + "form of `self.**.**`, it will cause AttributeError.") + elif name == '': + raise KeyError("The name of parameter can not be empty.") + elif hasattr(self, name) and name not in self._parameters: + raise KeyError("The parameter '{}' already exists.".format(name)) + elif parameter is not None and not isinstance(parameter, + framework.Parameter): + raise TypeError( + "The parameter to be added should be a Parameter, but received {}.". + format(type(parameter).__name__)) + else: + if parameter is None: + self._parameters[name] = None - if len(self._loaddict_holder) > 0: - assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in stat_dict".format( - parameter.name) + if len(self._loaddict_holder) > 0: + assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format( + parameter.name) - parameter.set_value(self._loaddict_holder[parameter.name]) + parameter.set_value(self._loaddict_holder[parameter.name]) - self._parameters[name] = parameter + self._parameters[name] = parameter return parameter def __getattr__(self, name): diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 4ee5e9895a7e145fc079bc9d678f34fb4d964a2e..bb55c6725e6a62f2cef393fd34b249c217be0c54 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -17,7 +17,9 @@ from __future__ import print_function from .. import core from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator from ..layers.layer_function_generator import OpProtoHolder +from ..layers import common_methods from . 
import to_variable, no_grad +import paddle import numpy as np import six @@ -30,6 +32,8 @@ _supported_int_dtype_ = [ core.VarDesc.VarType.INT64, ] +_already_patch_varbase = False + def monkey_patch_math_varbase(): """ @@ -140,25 +144,50 @@ def monkey_patch_math_varbase(): else: return int(var.numpy().flatten()[0]) - def _scalar_elementwise_add_(var, value): + @property + def _ndim_(var): + return len(var.shape) + + def _scalar_add_(var, value): return _scalar_elementwise_op_(var, 1.0, value) - def _scalar_elementwise_sub_(var, value): + def _scalar_sub_(var, value): return _scalar_elementwise_op_(var, 1.0, -value) - def _scalar_elementwise_rsub_(var, value): + def _scalar_rsub_(var, value): return _scalar_elementwise_op_(var, -1.0, value) - def _scalar_elementwise_mul_(var, value): + def _scalar_mul_(var, value): return _scalar_elementwise_op_(var, value, 0.0) - def _scalar_elementwise_div_(var, value): + def _scalar_div_(var, value): return _scalar_elementwise_op_(var, 1.0 / value, 0.0) - def _elemwise_method_creator_(method_name, - op_type, - reverse=False, - scalar_method=None): + # TODO(shenliang03): currently, it supports divide, floor_divide, remainder + # for binary operator by using the api to achieve the type promotion + def _binary_method_creator_(op_type, reverse=False): + import paddle + + def __impl__(self, other_var): + import paddle + op = getattr(paddle, op_type) + if reverse: + return op(other_var, self) + else: + return op(self, other_var) + + __impl__.__doc__ = """ + + See paddle.{}""".format(op_type) + __impl__.__name__ = op_type + + return __impl__ + + # for binary operator such as elementwise, compare + def _binary_creator_(method_name, + op_type, + reverse=False, + scalar_method=None): def __impl__(self, other_var): # FIXME(zjl): elementwise_div between integers cannot be converted to scale, # which may lose accuracy. This is a hot fix for release 1.6. @@ -200,60 +229,117 @@ def monkey_patch_math_varbase(): __impl__.__doc__ = """ {0} Args: - self(Variable): left hand variable - other_var(Variable|float|int): right hand variable + self(Tensor): left hand Tensor + other_var(Tensor|float|int): right hand Tensor Returns: - Variable + Tensor """.format(comment) __impl__.__name__ = method_name return __impl__ - # inject methods - for method_name, op_type, reverse, scalar_method in ( - ("__add__", "elementwise_add", False, _scalar_elementwise_add_), - # a+b == b+a. Do not need to reverse explicitly - ("__radd__", "elementwise_add", False, _scalar_elementwise_add_), - ("__sub__", "elementwise_sub", False, _scalar_elementwise_sub_), - ("__rsub__", "elementwise_sub", True, _scalar_elementwise_rsub_), - ("__mul__", "elementwise_mul", False, _scalar_elementwise_mul_), - # a*b == b*a. 
Do not need to reverse explicitly - ("__rmul__", "elementwise_mul", False, _scalar_elementwise_mul_), - ("__div__", "elementwise_div", False, _scalar_elementwise_div_), - ("__truediv__", "elementwise_div", False, _scalar_elementwise_div_), - ("__rdiv__", "elementwise_div", True, None), - ("__rtruediv__", "elementwise_div", True, None), - ("__pow__", "elementwise_pow", False, None), - ("__rpow__", "elementwise_pow", True, None), - ("__floordiv__", "elementwise_floordiv", False, None), - ("__mod__", "elementwise_mod", False, None), - # for logical compare - ("__eq__", "equal", False, None), - ("__ne__", "not_equal", False, None), - ("__lt__", "less_than", False, None), - ("__le__", "less_equal", False, None), - ("__gt__", "greater_than", False, None), - ("__ge__", "greater_equal", False, None)): - - setattr(core.VarBase, method_name, - _elemwise_method_creator_(method_name, op_type, reverse, - scalar_method)) - - # b = -a - core.VarBase.__neg__ = _neg_ - core.VarBase.__float__ = _float_ - core.VarBase.__long__ = _long_ - core.VarBase.__int__ = _int_ - core.VarBase.__len__ = _len_ - core.VarBase.__index__ = _index_ - core.VarBase.astype = astype - """ - When code is written like this - y = np.pi * var - ndarray.__mul__(self, var) is called, var will be traced as an array(by using __len__, __getitem__), which is not right. - when var.__array_ufunc__ is set to None, var.__rmul__(self, np) will be called. + # Todo(zhouwei): implement dygraph template to adapt to any function, receive('op_type', 'arg_template') + # Such as _method_creator_('addmm', 'x, y, alpha=1.0, beta=1.0, name=None'). It can reduce call time. + def _method_creator_(op_type, arg_template=None): + def __impl__(self): + op = getattr(core.ops, op_type) + return op(self) - The details can be seen bellow: - https://docs.scipy.org/doc/numpy-1.13.0/neps/ufunc-overrides.html#behavior-in-combination-with-python-s-binary-operations - """ - core.VarBase.__array_ufunc__ = None + __impl__.__doc__ = """ + + See paddle.{}""".format(op_type) + __impl__.__name__ = op_type + + return __impl__ + + varbase_methods = [ + # Type1: From custom fun or lambda + ## b=-a + ('__neg__', _neg_), + ('__float__', _float_), + ('__long__', _long_), + ('__int__', _int_), + ('__len__', _len_), + ('__index__', _index_), + ('astype', astype), + ('dim', lambda x: len(x.shape)), + ('ndimension', lambda x: len(x.shape)), + ('ndim', _ndim_), + ('size', lambda x: x.shape), + # Type2: From Template that create core.ops automatically. It's recommended. + ('__add__', + _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)), + ## a+b == b+a. Do not need to reverse explicitly + ('__radd__', + _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)), + ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False, + _scalar_sub_)), + ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True, + _scalar_rsub_)), + ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False, + _scalar_mul_)), + ## a*b == b*a. 
Do not need to reverse explicitly + ('__rmul__', + _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, + None)), + ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, + None)), + ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, + None)), + # These binary use paddle.optype + ('__div__', _binary_method_creator_('divide', False)), + ('__truediv__', _binary_method_creator_('divide', False)), + ('__rtruediv__', _binary_method_creator_('divide', True)), + ('__rdiv__', _binary_method_creator_('divide', True)), + ('__floordiv__', _binary_method_creator_('floor_divide', False)), + ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), + ('__mod__', _binary_method_creator_('remainder', False)), + ## for logical compare + ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), + ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), + ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)), + ('__le__', _binary_creator_('__le__', 'less_equal', False, None)), + ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)), + ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), + ('__array_ufunc__', None), + ('sigmoid', _method_creator_('sigmoid', 'name=None')), + ('logsigmoid', _method_creator_('logsigmoid', 'name=None')), + ('exp', _method_creator_('exp', 'name=None')), + ('tanh', _method_creator_('tanh', 'name=None')), + ('atan', _method_creator_('atan', 'name=None')), + ('tanh_shrink', _method_creator_('tanh_shrink', 'name=None')), + ('sqrt', _method_creator_('sqrt', 'name=None')), + ('rsqrt', _method_creator_('rsqrt', 'name=None')), + ('abs', _method_creator_('abs', 'name=None')), + ('ceil', _method_creator_('ceil', 'name=None')), + ('floor', _method_creator_('floor', 'name=None')), + ('cos', _method_creator_('cos', 'name=None')), + ('acos', _method_creator_('acos', 'name=None')), + ('asin', _method_creator_('asin', 'name=None')), + ('sin', _method_creator_('sin', 'name=None')), + ('sinh', _method_creator_('sinh', 'name=None')), + ('cosh', _method_creator_('cosh', 'name=None')), + ('round', _method_creator_('round', 'name=None')), + ('reciprocal', _method_creator_('reciprocal', 'name=None')), + ('square', _method_creator_('square', 'name=None')), + ('softplus', _method_creator_('softplus', 'name=None')), + ('softsign', _method_creator_('softsign', 'name=None')), + # Type3: Form module 'paddle.tensor' defaultly. + # It's not a goodway, because it will increase call time. + ] + + global _already_patch_varbase + if not _already_patch_varbase: + for method in varbase_methods: + method_name = method[0] + method_impl = method[1] + setattr(core.VarBase, method_name, method_impl) + else: + import paddle.tensor + for method_name in common_methods: + if hasattr(core.VarBase, method_name): continue + method_impl = getattr(paddle.tensor, method_name, None) + if method_impl: setattr(core.VarBase, method_name, method_impl) + + _already_patch_varbase = True diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 45744841fc5be5d76585b2dc70841cea5c7ca764..a14c3a81c121758ed90450cd5eb5990f3f7739e1 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle from six.moves import reduce from .. 
import core from ..layers import utils @@ -30,12 +31,13 @@ from ..data_feeder import check_variable_and_dtype, check_type import numpy as np import numbers import logging +import paddle.utils.deprecated as deprecated __all__ = [ 'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding', 'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm', - 'SpectralNorm', 'TreeConv', 'Flatten', 'SyncBatchNorm' + 'SpectralNorm', 'TreeConv', 'Flatten' ] @@ -2444,6 +2446,10 @@ class BilinearTensorProduct(layers.Layer): dtype=self._dtype, is_bias=True) + @deprecated( + since="2.0.0", + update_to="paddle.nn.Bilinear", + reason="New name and new args in Bilinear, easier to use.") def forward(self, x, y): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'BilinearTensorProduct') @@ -3202,220 +3208,6 @@ class TreeConv(layers.Layer): return self._helper.append_activation(pre_activation, act=self._act) -class SyncBatchNorm(layers.Layer): - """ - This interface is used to construct a callable object of the ``SyncBatchNorm`` class. - It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can - be used as a normalizer function for other operations, such as conv2d and fully connected - operations. - The data is normalized by the mean and variance of the channel based on whole mini-batch - , which including data in all gpus. - Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift `_ - for more details. - - When model in training mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. - Calculated as follows: - - .. math:: - - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ - - - :math:`x` : whole mini-batch data in all gpus - - :math:`m` : the size of the whole mini-batch data - - When model in evaluation mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, - which usually got from the pre-trained model). Global statistics calculated as follows: - - .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ - - The formula of normalization is as follows: - - .. math:: - - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift - - - :math:`\\eps` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable scale parameter vector - - :math:`\\beta` : trainable shift parameter vector - - Parameters: - num_features(int): Indicate the number of channels of the input ``Tensor``. - epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. - momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` - of this layer. If it is set to None or one attribute of ParamAttr, this layerr - will create ParamAttr as param_attr. 
If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. If it is set to False, - this layer will not have trainable scale parameter. Default: None. - bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. - If it is set to None or one attribute of ParamAttr, this layer - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. If it is set to False, this layer will not - have trainable bias parameter. Default: None. - track_running_stats(bool, optional): Whether to compute global stats, which including running mean and - running variance. Default: True. - - Returns: - None - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - - x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - paddle.disable_static() - x = paddle.to_tensor(x) - if paddle.fluid.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(2) - hidden1 = sync_batch_norm(x) - print(hidden1.numpy()) - # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] - """ - - def __init__(self, - num_features, - epsilon=1e-05, - momentum=0.9, - track_running_stats=True, - weight_attr=None, - bias_attr=None, - data_format='NCHW', - name=None): - super(SyncBatchNorm, self).__init__() - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self._num_features = num_features - self._data_layout = data_format - self._momentum = momentum - self._epsilon = epsilon - self._track_running_stats = track_running_stats - - if self._track_running_stats == False: - logging.warn( - "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version." - ) - - param_shape = [self._num_features] - - # create parameter - if weight_attr == False: - self.weight = self.create_parameter( - attr=None, shape=param_shape, default_initializer=Constant(1.0)) - self.weight.stop_gradient = True - else: - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - - if bias_attr == False: - self.bias = self.create_parameter( - attr=None, - shape=param_shape, - default_initializer=Constant(0.0), - is_bias=True) - self.bias.stop_gradient = True - else: - self.bias = self.create_parameter( - attr=self._bias_attr, shape=param_shape, is_bias=True) - self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. 
- - self._mean = self.create_parameter( - attr=ParamAttr( - name=None, - initializer=Constant(0.0), - trainable=False, - do_model_average=True), - shape=param_shape, - dtype=self._dtype) - self._mean.stop_gradient = True - - self._variance = self.create_parameter( - attr=ParamAttr( - name=None, - initializer=Constant(1.0), - trainable=False, - do_model_average=True), - shape=param_shape, - dtype=self._dtype) - self._variance.stop_gradient = True - - def forward(self, x): - # create output - # mean and mean_out share the same memory - mean_out = self._mean - # variance and variance out share the same memory - variance_out = self._variance - - ### train mode: use mini-batch stats, eval mode: use global stats - if in_dygraph_mode(): - attrs = ("momentum", self._momentum, "epsilon", self._epsilon, - "is_test", not self.training, "data_layout", - self._data_layout, "use_mkldnn", False, "fuse_with_relu", - False, "use_global_stats", not self.training, - 'trainable_statistics', False) - sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( - x, self.weight, self.bias, self._mean, self._variance, mean_out, - variance_out, *attrs) - - return sync_batch_norm_out - - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'BatchNorm') - - attrs = { - "momentum": self._momentum, - "epsilon": self._epsilon, - "is_test": not self.training, - "data_layout": self._data_layout, - "use_mkldnn": False, - "fuse_with_relu": False, - "use_global_stats": not self.training, - "trainable_statistics": False, - } - - inputs = { - "X": [x], - "Scale": [self.weight], - "Bias": [self.bias], - "Mean": [self._mean], - "Variance": [self._variance] - } - - saved_mean = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) - saved_variance = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) - sync_batch_norm_out = self._helper.create_variable_for_type_inference( - self._dtype) - - outputs = { - "Y": [sync_batch_norm_out], - "MeanOut": [mean_out], - "VarianceOut": [variance_out], - "SavedMean": [saved_mean], - "SavedVariance": [saved_variance] - } - - self._helper.append_op( - type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) - return sync_batch_norm_out - - class Flatten(layers.Layer): """ :alias_main: paddle.nn.Flatten @@ -3457,19 +3249,6 @@ class Flatten(layers.Layer): self.stop_axis = stop_axis def forward(self, input): - out = self._helper.create_variable_for_type_inference(input.dtype) - x_shape = self._helper.create_variable_for_type_inference(input.dtype) - - if in_dygraph_mode(): - dy_out, _ = core.ops.flatten_contiguous_range( - input, 'start_axis', self.start_axis, 'stop_axis', - self.stop_axis) - return dy_out - self._helper.append_op( - type="flatten_contiguous_range", - inputs={"X": input}, - outputs={"Out": out, - "XShape": x_shape}, - attrs={"start_axis": self.start_axis, - "stop_axis": self.stop_axis}) + out = paddle.tensor.manipulation.flatten( + input, start_axis=self.start_axis, stop_axis=self.stop_axis) return out diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 24e7f64cebf602965076fdf172ac5ecdc917b624..bd578e6ba98a0f31a952bd5620b90e9464fe8666 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -11,21 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. + import os import six import numpy as np +import warnings from collections import OrderedDict -from .. import core -from . import layers -from . import parallel_helper -from .. import framework -from . import to_variable, no_grad + +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.dygraph import layers +from paddle.fluid.dygraph import parallel_helper +from paddle.fluid.dygraph import to_variable, no_grad +from paddle.utils import deprecated __all__ = ["prepare_context", "ParallelEnv", "DataParallel"] ParallelStrategy = core.ParallelStrategy +@deprecated(since="2.0.0", update_to="paddle.distributed.init_parallel_env") def prepare_context(strategy=None): ''' :api_attr: imperative @@ -39,17 +44,18 @@ def prepare_context(strategy=None): if strategy.nranks < 2: return assert framework.in_dygraph_mode() is True, \ - "dygraph.prepare_context should be used with dygrahp mode." + "dygraph.prepare_context should be used with dygraph mode." place = framework._current_expected_place() assert place is not None, \ "dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard." - if isinstance(place, core.CUDAPlace): - parallel_helper._set_parallel_ctx( - core.NCCLParallelContext(strategy, place)) - else: - # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation - assert ("Only support CUDAPlace for now.") - parallel_helper._init_parallel_ctx() + if not parallel_helper._is_parallel_ctx_initialized(): + if isinstance(place, core.CUDAPlace): + parallel_helper._set_parallel_ctx( + core.NCCLParallelContext(strategy, place)) + else: + # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation + assert ("Only support CUDAPlace for now.") + parallel_helper._init_parallel_ctx() return strategy @@ -112,84 +118,84 @@ class ParallelEnv(object): """ def __init__(self): - self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) - self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0")) + self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + self._device_id = int(os.getenv("FLAGS_selected_gpus", "0")) self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") @property - def nranks(self): + def rank(self): """ - The number of trainers, generally refers to the number of GPU cards used in training. + Rank of current trainer. - Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1. + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0. Examples: .. code-block:: python - # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 - import paddle.fluid as fluid + # execute this command in terminal: export PADDLE_TRAINER_ID=0 + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The nranks is %d" % env.nranks) - # The nranks is 4 + env = dist.ParallelEnv() + print("The rank is %d" % env.rank) + # The rank is 0 """ - return self._nranks + return self._rank @property - def local_rank(self): + def world_size(self): """ - The current trainer number. + The number of trainers (number of processes participating in current job). 
- Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1. Examples: .. code-block:: python - # execute this command in terminal: export PADDLE_TRAINER_ID=0 - import paddle.fluid as fluid + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The local rank is %d" % env.local_rank) - # The local rank is 0 + env = dist.ParallelEnv() + print("The world_size is %d" % env.world_size) + # The world_size is 4 """ - return self._local_rank + return self._world_size @property - def dev_id(self): + def device_id(self): """ The ID of selected GPU card for parallel training. - Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0. + Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0. Examples: .. code-block:: python # execute this command in terminal: export FLAGS_selected_gpus=1 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The device id are %d" % env.dev_id) + env = dist.ParallelEnv() + print("The device id are %d" % env.device_id) # The device id are 1 """ - return self._dev_id + return self._device_id @property def current_endpoint(self): """ The endpoint of current trainer, it is in the form of (node IP + port). - Its value is equal to the value of the environment variable PADDLE_CURRENT_ENDPOINT. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "". Examples: .. code-block:: python # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The current endpoint are %s" % env.current_endpoint) # The current endpoint are 127.0.0.1:6170 """ @@ -201,20 +207,25 @@ class ParallelEnv(object): The endpoints of all trainer nodes in the task, which are used to broadcast the NCCL ID when NCCL2 is initialized. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ENDPOINTS. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "". Examples: .. code-block:: python # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The trainer endpoints are %s" % env.trainer_endpoints) # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] """ return self._trainer_endpoints + # [aliases] Compatible with old method names + local_rank = rank + nranks = world_size + dev_id = device_id + # NOTE: [ Compatible ] Originally this class name is `Env`. The semantics of the old class names # are inaccurate and may confuse users, so replace it with `ParallelEnv`, but to be compatible @@ -227,64 +238,98 @@ class DataParallel(layers.Layer): Run the dygraph module with data parallelism. Currently, DataParallel class only supports to run the dynamic graph - with multi-process. The usage is: - `python -m paddle.distributed.launch --selected_gpus=0,1 dynamic_graph_test.py`. 
- And the content of `dynamic_graph_test.py` is the code of examples. + with multi-process. + + Now supports two ways to start training: + + 1. start by ``paddle.distributed.spawn`` method, for example: + + ``python demo.py`` (spawn need to be called in ``__main__`` method) + + 2. start by ``paddle.distributed.launch`` module, for example: + + ``python -m paddle.distributed.launch --selected_gpus=0,1 demo.py`` . + + And the content of `demo.py` is the code of examples. Args: layers(Layer): The module that should be executed by data parallel. - strategy(ParallelStrategy): The strategy of data parallelism, contains - environment configuration related to parallel execution. - + strategy(ParallelStrategy, optional): (deprecated) The strategy of data parallelism, + contains environment configuration related to parallel execution. Default: None. + Returns: Layer: The data paralleled module. Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - import paddle.fluid.dygraph as dygraph - from paddle.fluid.optimizer import AdamOptimizer - from paddle.fluid.dygraph.nn import Linear - from paddle.fluid.dygraph.base import to_variable - - place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place=place): - - # prepare the data parallel context - strategy=dygraph.prepare_context() - - linear = Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer() - - # make the module become the data parallelism module - linear = dygraph.DataParallel(linear, strategy) - - x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = to_variable(x_data) - - hidden = linear(data) - avg_loss = fluid.layers.mean(hidden) - - # scale the loss according to the number of trainers. - avg_loss = linear.scale_loss(avg_loss) + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist - avg_loss.backward() - - # collect the gradients of trainers. - linear.apply_collective_grads() - - adam.minimize(avg_loss) - linear.clear_gradients() + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env() + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + # 1. start by ``paddle.distributed.spawn`` (default) + dist.spawn(train, nprocs=2) + # 2. start by ``paddle.distributed.launch`` + # train() """ - def __init__(self, layers, strategy): + def __init__(self, layers, strategy=None): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers - self._strategy = strategy + + # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. + # It just stores some environment variables, which can be constructed by + # ParallelEnv. Here it is set as an optional argument. 
+ # This parameter is not removed because of compatibility with 1.x writing. + if strategy is not None: + self._strategy = strategy + else: + self._strategy = ParallelStrategy() + self._strategy.nranks = ParallelEnv().nranks + self._strategy.local_rank = ParallelEnv().local_rank + self._strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + self._strategy.current_endpoint = ParallelEnv().current_endpoint def forward(self, *inputs, **kwargs): return self._layers(*inputs, **kwargs) @@ -306,20 +351,23 @@ class DataParallel(layers.Layer): import numpy as np import paddle.fluid as fluid - import paddle.fluid.dygraph as dygraph - from paddle.fluid.optimizer import AdamOptimizer - from paddle.fluid.dygraph.nn import Linear - from paddle.fluid.dygraph.base import to_variable - - place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place=place): - strategy=dygraph.prepare_context() - linear = Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer() - linear = dygraph.DataParallel(linear, strategy) + + place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) + with fluid.dygraph.guard(place): + + # prepare the data parallel context + strategy = fluid.dygraph.prepare_context() + + linear = fluid.dygraph.Linear(1, 10, act="softmax") + adam = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, parameter_list=linear.parameters()) + + # make the module become the data parallelism module + linear = fluid.dygraph.DataParallel(linear, strategy) x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = to_variable(x_data) + data = fluid.dygraph.to_variable(x_data) + hidden = linear(data) avg_loss = fluid.layers.mean(hidden) @@ -327,6 +375,8 @@ class DataParallel(layers.Layer): avg_loss = linear.scale_loss(avg_loss) avg_loss.backward() + + # collect the gradients of trainers. linear.apply_collective_grads() adam.minimize(avg_loss) @@ -390,23 +440,29 @@ class DataParallel(layers.Layer): import numpy as np import paddle.fluid as fluid - import paddle.fluid.dygraph as dygraph - from paddle.fluid.optimizer import AdamOptimizer - from paddle.fluid.dygraph.nn import Linear - from paddle.fluid.dygraph.base import to_variable - - place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place=place): - strategy=dygraph.prepare_context() - linear = Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer() - linear = dygraph.DataParallel(linear, strategy) + + place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) + with fluid.dygraph.guard(place): + + # prepare the data parallel context + strategy = fluid.dygraph.prepare_context() + + linear = fluid.dygraph.Linear(1, 10, act="softmax") + adam = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, parameter_list=linear.parameters()) + + # make the module become the data parallelism module + linear = fluid.dygraph.DataParallel(linear, strategy) x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = to_variable(x_data) + data = fluid.dygraph.to_variable(x_data) + hidden = linear(data) avg_loss = fluid.layers.mean(hidden) + + # scale the loss according to the number of trainers. avg_loss = linear.scale_loss(avg_loss) + avg_loss.backward() # collect the gradients of trainers. 
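# --- Editorial note on the renamed ParallelEnv properties above, as a minimal
# sketch: the old spellings are kept as aliases, so 1.x-style code keeps working.
import paddle.distributed as dist

env = dist.ParallelEnv()
assert env.rank == env.local_rank            # PADDLE_TRAINER_ID
assert env.world_size == env.nranks          # PADDLE_TRAINERS_NUM
assert env.device_id == env.dev_id           # FLAGS_selected_gpus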
diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py index f378211de2b8a1579ab139318cdc3cb8d5bdc2de..ff1675f0ae8a40b2487d5834b262a1b730641262 100644 --- a/python/paddle/fluid/dygraph/parallel_helper.py +++ b/python/paddle/fluid/dygraph/parallel_helper.py @@ -23,6 +23,11 @@ def _is_data_parallel_mode(): os.getenv("PADDLE_TRAINERS_NUM", "1")) > 1 +def _is_parallel_ctx_initialized(): + global __parallel_ctx__clz__ + return __parallel_ctx__clz__ is not None + + def _set_parallel_ctx(nccl_parallel_context): global __parallel_ctx__clz__ assert __parallel_ctx__clz__ is None, \ diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9dbaab2580d21397fa7a4e03b03a5f1c4ac887f2..7cb17843396a6ed79c36126172a253864dbf3d0f 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -15,7 +15,6 @@ import inspect from .. import framework from .. import core -from . import BackwardStrategy from ..framework import Variable, Parameter, ParamBase from .base import switch_to_static_graph import numpy as np @@ -129,19 +128,18 @@ def monkey_patch_varbase(): framework._current_expected_place()) @framework.dygraph_only - def backward(self, backward_strategy=None, retain_graph=False): + def backward(self, retain_graph=False): """ **Notes**: **This API is ONLY available in Dygraph mode** - Run backward of current Graph which starts from current Variable + Run backward of current Graph which starts from current Tensor. Args: - backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would - like to add more ops to the built graph after calling this method(`backward`), set the parameter - `retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. - Defaults to False. + like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter + :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. + Defaults to False. Returns: NoneType: None @@ -149,32 +147,25 @@ def monkey_patch_varbase(): Examples: .. code-block:: python - import paddle.fluid as fluid import numpy as np + import paddle + paddle.disable_static() x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) - # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since - # there is no one need gradient on it. - tmp.stop_gradient=False - inputs2.append(tmp) - ret2 = fluid.layers.sums(inputs2) - loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + inputs = [] + for _ in range(10): + tmp = paddle.to_tensor(x) + # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since + # there is no one need gradient on it. 
+ tmp.stop_gradient=False + inputs.append(tmp) + ret = paddle.sums(inputs) + loss = paddle.reduce_sum(ret) + loss.backward() """ if framework.in_dygraph_mode(): - if backward_strategy is None: - backward_strategy = BackwardStrategy() - backward_strategy.sort_sum_gradient = False - - self._run_backward(backward_strategy, - framework._dygraph_tracer(), retain_graph) + self._run_backward(framework._dygraph_tracer(), retain_graph) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") @@ -205,9 +196,7 @@ def monkey_patch_varbase(): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) """ diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 5759b94276351b2257769a619487aa6e14858f97..2e3f34f41648a9343b4bccd1044bcd3f7b3d8189 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -110,7 +110,7 @@ def scope_guard(scope): _switch_scope(ex) -def as_numpy(tensor): +def as_numpy(tensor, copy=False): """ Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information. For higher dimensional sequence data, please use LoDTensor directly. @@ -129,6 +129,7 @@ def as_numpy(tensor): Args: tensor(Variable): a instance of Tensor + copy(bool, optional): Whether to use deep copy. Returns: numpy.ndarray @@ -145,7 +146,10 @@ def as_numpy(tensor): Please set the parameter 'return_numpy' as 'False' to \ return LoDTensor itself directly.") if tensor._is_initialized(): - return np.array(tensor) + if copy: + return np.array(tensor) + else: + return np.asarray(tensor) else: return None @@ -350,7 +354,7 @@ def _fetch_var(name, scope=None, return_numpy=True): " program.") tensor = var.get_tensor() if return_numpy: - tensor = as_numpy(tensor) + tensor = as_numpy(tensor, copy=True) return tensor @@ -850,6 +854,7 @@ class Executor(object): def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, return_numpy, return_merged): + from paddle.optimizer.lr_scheduler import _LRScheduler exe = program._executor # TODO(zhenghuihuang): quantization uses Graph in CompiledProgram # instead of program. 
We will add support for checking Vars in Graph @@ -893,6 +898,16 @@ class Executor(object): res.append(res_dict) exe.feed_tensors_into_local_scopes(res) + if hasattr(program._program, 'lr_sheduler'): + lr_sheduler = program._program.lr_sheduler + assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler" + lr_value = lr_sheduler() + lr_var = program._program.global_block().vars[lr_sheduler._var_name] + lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype) + exe.feed_and_split_tensor_into_local_scopes({ + lr_sheduler._var_name: lr_tensor + }) + fetch_var_names = list(map(_to_name_str, fetch_list)) tensors = exe.run(fetch_var_names, return_merged)._move_to_list() return as_numpy(tensors) if return_numpy else tensors @@ -1222,7 +1237,7 @@ class Executor(object): def _run_program(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache): - + from paddle.optimizer.lr_scheduler import _LRScheduler if feed is None: feed = {} elif isinstance(feed, (list, tuple)): @@ -1278,6 +1293,16 @@ class Executor(object): fetch_var_name=fetch_var_name) self._feed_data(program, feed, feed_var_name, scope) + if hasattr(program, 'lr_sheduler'): + assert isinstance(program.lr_sheduler, + _LRScheduler), "must be _LRScheduler" + lr_sheduler = program.lr_sheduler + lr_value = lr_sheduler() + lr_var = program.global_block().vars[lr_sheduler._var_name] + data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) + tensor = core.get_variable_tensor(scope, lr_sheduler._var_name) + tensor.set(data, self.place) + if not use_program_cache: self._default_executor.run(program.desc, scope, 0, True, True, fetch_var_name) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index e844c74c106e378833b35065ba91ef88159e2553..fc4e91aad4fff1db325e17828d26ccd94c164c3d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -48,6 +48,7 @@ __all__ = [ 'cuda_pinned_places', 'in_dygraph_mode', 'is_compiled_with_cuda', + 'is_compiled_with_xpu', 'Variable', 'ComplexVariable', 'load_op_library', @@ -310,6 +311,21 @@ def _cuda_ids(): return device_ids +def is_compiled_with_xpu(): + """ + Whether this whl package can be used to run the model on XPU. + + Returns (bool): support xpu or not. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + support_xpu = fluid.is_compiled_with_xpu() + """ + return core.is_compiled_with_xpu() + + def is_compiled_with_cuda(): """ Whether this whl package can be used to run the model on GPU. @@ -1090,15 +1106,18 @@ class Variable(object): pass @fake_interface_only - def backward(self, backward_strategy=None): + def backward(self, retain_graph=False): """ **Notes**: **This API is ONLY available in Dygraph mode** - Run backward of current Graph which starts from current Variable + Run backward of current Graph which starts from current Tensor. Args: - backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward + retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would + like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter + :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. + Defaults to False. Returns: NoneType: None @@ -1106,23 +1125,21 @@ class Variable(object): Examples: .. 
code-block:: python - import paddle.fluid as fluid import numpy as np + import paddle + paddle.disable_static() x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) - # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since - # there is no one need gradient on it. - tmp.stop_gradient=False - inputs2.append(tmp) - ret2 = fluid.layers.sums(inputs2) - loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + inputs = [] + for _ in range(10): + tmp = paddle.to_tensor(x) + # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since + # there is no one need gradient on it. + tmp.stop_gradient=False + inputs.append(tmp) + ret = paddle.sums(inputs) + loss = paddle.reduce_sum(ret) + loss.backward() """ pass @@ -1154,9 +1171,7 @@ class Variable(object): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) # example2: return tuple of ndarray @@ -1202,9 +1217,7 @@ class Variable(object): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) loss2.clear_gradient() print("After clear {}".format(loss2.gradient())) @@ -4434,6 +4447,8 @@ class Program(object): p._current_role = self._current_role p.__op_role_var = self.__op_role_var p._appending_grad_times = self._appending_grad_times + if hasattr(self, 'lr_sheduler'): + p.lr_sheduler = self.lr_sheduler #NOTE(zhiqiu): we sync the cloned program, to update its program by # its desc. diff --git a/python/paddle/fluid/generator.py b/python/paddle/fluid/generator.py index 24262e3f5666abe425365009ceaf6e6400efc76d..98924f801413bcd822a0d9a6fd61adcc4d00fddc 100644 --- a/python/paddle/fluid/generator.py +++ b/python/paddle/fluid/generator.py @@ -17,44 +17,28 @@ from . import core __all__ = ['Generator'] -default_rng_seed_val = 34342423252 - -class Generator(object): +class Generator(core.Generator): """Generator class""" - def __init__(self, device="CPU"): - """init""" - self.device = device - seed_in = default_rng_seed_val - if self.device == "CPU": - self.generator = core.Generator() - self.generator.manual_seed(seed_in) - else: - raise ValueError( - "generator class with device %s does not exist, currently only support generator with device 'CPU' " - % device) - - def get_state(self): - return self.generator.get_state() - - def set_state(self, state): - self.generator.set_state(state) + def __init__(self, place=None): + """ + Create a generator object which manages the random number generation. ( Experimental Feature ) - def manual_seed(self, seed): - self.generator.manual_seed(seed) + Parameters: + place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. - def seed(self): - return self.generator.seed() + Returns: + Generator: A generator object. 
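A minimal usage sketch of the refactored Generator above. It assumes the methods of the removed wrapper (manual_seed, get_state, set_state) are inherited unchanged from core.Generator, which the new class now subclasses; only CPUPlace is accepted by the new constructor.

.. code-block:: python

    from paddle.fluid import core
    from paddle.fluid.generator import Generator

    gen = Generator(core.CPUPlace())   # other places raise ValueError for now
    gen.manual_seed(1234)              # inherited from core.Generator
    state = gen.get_state()            # snapshot the RNG state
    gen.set_state(state)               # restore it later for reproducible runs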
- def initial_seed(self): - return self.generator.initial_seed() - - def random(self): - return self.generator.random() - - def get_cpu_engine(self): - return self.generator.get_cpu_engine() - - def set_cpu_engine(self, cpu_engine): - self.generator.set_cpu_engine(cpu_engine) + """ + self.place = place + if not place: + place = core.CPUPlace() + if isinstance(place, core.CPUPlace): + super(Generator, self).__init__() + else: + raise ValueError( + "Generator class with %s does is not supported yet, currently only support generator with CPUPlace " + % place) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py index d2c7397c85f8df155444d9272c7b75596f0fe169..1a7a82fbfac19b41e8b96c231ca74398f6b2214c 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py @@ -38,6 +38,7 @@ from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker from paddle.fluid.incubate.fleet.parameter_server import version from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops +from paddle.fluid.incubate.fleet.parameter_server.ir.public import _has_global_step from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import TrainerRuntimeConfig, DistributedStrategy, \ SyncStrategy, AsyncStrategy, HalfAsyncStrategy, GeoStrategy, StrategyFactory @@ -161,9 +162,9 @@ class FleetTranspiler(Fleet): print(trainer_config) - lrs = _get_lr_ops(self._origin_main_program) + lrs = _has_global_step(_get_lr_ops(self._origin_main_program)) - if len(lrs) > 0: + if lrs > 0: kwargs = {"need_global_step": "1"} else: kwargs = {"need_global_step": "0"} @@ -186,14 +187,6 @@ class FleetTranspiler(Fleet): recv_ctx = fleet.compiled_config.get_communicator_recv_context( recv_type=1) - for name, ctx in send_ctx.items(): - print("name: {}, ctx: {}".format(name, ctx)) - - print("==== = ==== =============== ====") - - for name, ctx in recv_ctx.items(): - print("name: {}, ctx: {}".format(name, ctx)) - from paddle.fluid.communicator import Communicator self._communicator = Communicator( trainer_config.mode, kwargs, @@ -393,6 +386,12 @@ class FleetTranspiler(Fleet): "in fleet.save_inference_model() function, executor must be as Executor type" ) + # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save + if not isinstance(executor.place, fluid.CPUPlace): + save_executor = Executor(fluid.CPUPlace()) + else: + save_executor = executor + if main_program is not None: if isinstance(main_program, CompiledProgram): raise TypeError( @@ -670,6 +669,11 @@ if you would like to save all variables in a raise TypeError( "in fleet.save_persistables() function, executor must be as Executor type" ) + # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save + if not isinstance(executor.place, fluid.CPUPlace): + save_executor = Executor(fluid.CPUPlace()) + else: + save_executor = executor if main_program is None: main_program = self.main_program @@ -679,7 +683,8 @@ if you would like to save all variables in a "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed" ) - self._save_distributed_persistables(executor, dirname, main_program) + 
self._save_distributed_persistables(save_executor, dirname, + main_program) @staticmethod def __exclude_vars(exclude_var_names=[]): diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index b96eff19e9b9c5d8e78b85e61b9a69afee106546..f9889997d9e38c98c4a736a62dbc72da7029f337 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -43,6 +43,8 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundR OP_NAME_SCOPE = "op_namescope" CLIP_OP_NAME_SCOPE = "@CLIP" STEP_COUNTER = "@PS_STEP_COUNTER@" +LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@" + OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName() RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC @@ -62,6 +64,17 @@ def _get_lr_ops(program): return lr_ops +def _has_global_step(lr_ops): + if len(lr_ops) > 0: + for idx, op in enumerate(lr_ops): + if op.type != 'increment': + continue + counter = op.input("X")[0] + if counter == LEARNING_RATE_DECAY_COUNTER: + return True + return False + + def is_sparse_op(op): if op.type == "lookup_table" and op.attr('is_sparse') is True and op.attr( 'is_distributed') is False: diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 347927509e6d539555bd4d1b7a594febbc68f57b..15a3022f932f4a702bf7f94ed936468b6a06e94e 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -17,10 +17,12 @@ import warnings from .framework import Variable, in_dygraph_mode from .layer_helper import LayerHelper from .data_feeder import check_variable_and_dtype, check_dtype +from ..utils import deprecated __all__ = ['one_hot', 'embedding'] +@deprecated(since='2.0.0', update_to='paddle.nn.functional.one_hot') def one_hot(input, depth, allow_out_of_range=False): """ :alias_main: paddle.nn.functional.one_hot diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 0e813e21ea3c0677fff8e9ac06af654ca52c02c4..ef469377acfbc0c2c521de61f8eacc0f7c9f0854 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -45,10 +45,23 @@ class SimpleLayer(Layer): def run_check(): - ''' install check to verify if install is success - + """To check whether install is successful This func should not be called only if you need to verify installation - ''' + + Examples: + .. code-block: python + + import paddle.fluid as fluid + fluid.install_check.run_check() + + # If installed successfully, output may be + # Running Verify Fluid Program ... + # W0805 04:24:59.496919 35357 device_context.cc:268] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.2, Runtime API Version: 10.1 + # W0805 04:24:59.505594 35357 device_context.cc:276] device: 0, cuDNN Version: 7.6. + # Your Paddle Fluid works well on SINGLE GPU or CPU. + # Your Paddle Fluid works well on MUTIPLE GPU or CPU. + # Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now + """ print("Running Verify Fluid Program ... 
") device_list = [] diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 10989d13b35056b367d08e8a251ca02a5a379500..7aedb2ca2566c6da196b655d1aa24ed99150a709 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -274,7 +274,6 @@ def generate_activation_fn(op_type): return output func.__name__ = op_type - func.__module__ = "paddle.fluid.layers" func.__doc__ = _generate_doc_string_( op_proto, additional_args_lines=[ diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index fd1e7f800b928cfcecb9e09877f08c42c81defa6..38fc34472c8bc64338e2468bdf3f4b0bab1370ce 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -16,6 +16,7 @@ from __future__ import print_function import warnings import inspect +import paddle from .. import core from ..framework import Variable, unique_name @@ -45,6 +46,7 @@ EXPRESSION_MAP = { "__pow__": "A ** B", "__rpow__": "A **= B", "__floordiv__": "A //B", + "__rfloordiv__": "A //= B", "__mod__": "A % B", "__eq__": "A == B", "__ne__": "A != B", @@ -54,6 +56,31 @@ EXPRESSION_MAP = { "__ge__": "A >= B" } +# method for Tensor from paddle.tensor +# edit it when paddle.tensor has new method about Tensor operation +common_methods = [ + 'exp', 'tanh', 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', 'cos', + 'acos', 'asin', 'sin', 'sinh', 'cosh', 'round', 'reciprocal', 'square', + 'rank', 'matmul', 'dot', 'norm', 'transpose', 'dist', 't', 'cross', + 'cholesky', 'bmm', 'histogram', 'equal', 'greater_equal', 'greater_than', + 'is_empty', 'isfinite', 'less_equal', 'less_than', 'logical_and', + 'logical_not', 'logical_or', 'logical_xor', 'not_equal', 'reduce_all', + 'reduce_any', 'allclose', 'equal_all', 'cast', 'expand', 'expand_as', + 'tile', 'flatten', 'gather', 'gather_nd', 'reshape', 'reverse', 'scatter', + 'scatter_nd_add', 'scatter_nd', 'shard_index', 'slice', 'split', 'squeeze', + 'strided_slice', 'unique', 'unique_with_counts', 'unsqueeze', 'flip', + 'unbind', 'roll', 'cumsum', 'increment', 'log', 'pow', 'reciprocal', + 'round', 'rsqrt', 'scale', 'sign', 'stanh', 'sum', 'reduce_prod', 'max', + 'min', 'mm', 'div', 'multiply', 'add', 'logsumexp', 'log1p', 'erf', + 'addcmul', 'addmm', 'clamp', 'trace', 'kron', 'argmax', 'argmin', 'argsort', + 'has_inf', 'has_nan', 'topk', 'index_select', 'nonzero', 'sort', + 'index_sample', 'mean', 'std', 'var', 'elementwise_add', 'elementwise_div', + 'elementwise_floordiv', 'elementwise_mod', 'elementwise_pow', + 'elementwise_sub' +] + +_already_patch_variable = False + def monkey_patch_variable(): def unique_tmp_name(): @@ -179,7 +206,7 @@ def monkey_patch_variable(): "out_dtype": out.dtype}) return out - def _scalar_elementwise_op_(var, scale, bias): + def _scalar_op_(var, scale, bias): block = current_block(var) out = create_new_tmp_var(block, var.dtype) block.append_op( @@ -191,27 +218,46 @@ def monkey_patch_variable(): return out def _neg_(var): - return _scalar_elementwise_op_(var, -1.0, 0.0) + return _scalar_op_(var, -1.0, 0.0) + + def _scalar_add_(var, value): + return _scalar_op_(var, 1.0, value) - def _scalar_elementwise_add_(var, value): - return _scalar_elementwise_op_(var, 1.0, value) + def _scalar_sub_(var, value): + return _scalar_op_(var, 1.0, -value) - def _scalar_elementwise_sub_(var, value): - return _scalar_elementwise_op_(var, 1.0, -value) + def _scalar_rsub_(var, 
value): + return _scalar_op_(var, -1.0, value) - def _scalar_elementwise_rsub_(var, value): - return _scalar_elementwise_op_(var, -1.0, value) + def _scalar_mul_(var, value): + return _scalar_op_(var, value, 0.0) - def _scalar_elementwise_mul_(var, value): - return _scalar_elementwise_op_(var, value, 0.0) + def _scalar_div_(var, value): + return _scalar_op_(var, 1.0 / value, 0.0) - def _scalar_elementwise_div_(var, value): - return _scalar_elementwise_op_(var, 1.0 / value, 0.0) + # TODO(shenliang03): currently, it supports divide, floor_divide, remainder + # for binary operator by using the api to achieve the type promotion + def _binary_method_creator_(op_type, reverse=False): + import paddle + + def __impl__(self, other_var): + op = getattr(paddle, op_type) + if reverse: + return op(other_var, self) + else: + return op(self, other_var) + + __impl__.__doc__ = """ + + See paddle.{}""".format(op_type) + __impl__.__name__ = op_type + + return __impl__ - def _elemwise_method_creator_(method_name, - op_type, - reverse=False, - scalar_method=None): + def _binary_creator_(method_name, + op_type, + reverse=False, + scalar_method=None): def __impl__(self, other_var): # FIXME(zjl): elementwise_div between integers cannot be converted to scale, # which may lose accuracy. This is a hot fix for release 1.6. @@ -296,35 +342,56 @@ def monkey_patch_variable(): __impl__.__name__ = method_name return __impl__ - # inject methods - for method_name, op_type, reverse, scalar_method in ( - ("__add__", "elementwise_add", False, _scalar_elementwise_add_), - # a+b == b+a. Do not need to reverse explicitly - ("__radd__", "elementwise_add", False, _scalar_elementwise_add_), - ("__sub__", "elementwise_sub", False, _scalar_elementwise_sub_), - ("__rsub__", "elementwise_sub", True, _scalar_elementwise_rsub_), - ("__mul__", "elementwise_mul", False, _scalar_elementwise_mul_), - # a*b == b*a. Do not need to reverse explicitly - ("__rmul__", "elementwise_mul", False, _scalar_elementwise_mul_), - ("__div__", "elementwise_div", False, _scalar_elementwise_div_), - ("__truediv__", "elementwise_div", False, _scalar_elementwise_div_), - ("__rdiv__", "elementwise_div", True, None), - ("__rtruediv__", "elementwise_div", True, None), - ("__pow__", "elementwise_pow", False, None), - ("__rpow__", "elementwise_pow", True, None), - ("__floordiv__", "elementwise_floordiv", False, None), - ("__mod__", "elementwise_mod", False, None), - # for logical compare - ("__eq__", "equal", False, None), - ("__ne__", "not_equal", False, None), - ("__lt__", "less_than", False, None), - ("__le__", "less_equal", False, None), - ("__gt__", "greater_than", False, None), - ("__ge__", "greater_equal", False, None)): - setattr(Variable, method_name, - _elemwise_method_creator_(method_name, op_type, reverse, - scalar_method)) - - # b = -a - Variable.__neg__ = _neg_ - Variable.astype = astype + variable_methods = [ + # b=-a + ('__neg__', _neg_), + ('astype', astype), + ('__add__', _binary_creator_('__add__', 'elementwise_add', False, + _scalar_add_)), + # a+b == b+a. Do not need to reverse explicitly + ('__radd__', + _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)), + ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False, + _scalar_sub_)), + ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True, + _scalar_rsub_)), + ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False, + _scalar_mul_)), + # a*b == b*a. 
Do not need to reverse explicitly + ('__rmul__', + _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, + None)), + ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, + None)), + # These binary use paddle.optype + ('__div__', _binary_method_creator_('divide', False)), + ('__rdiv__', _binary_method_creator_('divide', True)), + ('__truediv__', _binary_method_creator_('divide', False)), + ('__rtruediv__', _binary_method_creator_('divide', True)), + ('__floordiv__', _binary_method_creator_('floor_divide', False)), + ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), + ('__mod__', _binary_method_creator_('remainder', False)), + # for logical compare + ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), + ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), + ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)), + ('__le__', _binary_creator_('__le__', 'less_equal', False, None)), + ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)), + ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)) + ] + + global _already_patch_variable + if not _already_patch_variable: + for method in variable_methods: + method_name = method[0] + method_impl = method[1] + setattr(Variable, method_name, method_impl) + else: + import paddle.tensor + for method_name in common_methods: + if hasattr(Variable, method_name): continue + method_impl = getattr(paddle.tensor, method_name, None) + if method_impl: setattr(Variable, method_name, method_impl) + + _already_patch_variable = True diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py old mode 100644 new mode 100755 index 3595406a011ce8d4dc3864cff618a9e2c9c7c2c4..ec71e4c9912295ca0844ce91dd2e06e03d9a216d --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -26,7 +26,7 @@ import six import paddle from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program +from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator from .. import dygraph_utils from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -35,6 +35,7 @@ from . import utils from .. import unique_name from functools import reduce from .. import core +from ...utils import deprecated from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype import paddle from paddle.utils import deprecated @@ -931,6 +932,7 @@ def cos_sim(X, Y): return out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.dropout") def dropout(x, dropout_prob, is_test=False, @@ -938,9 +940,6 @@ def dropout(x, name=None, dropout_implementation="downgrade_in_infer"): """ - :alias_main: paddle.nn.functional.dropout - :alias: paddle.nn.functional.dropout,paddle.nn.functional.common.dropout - :old_api: paddle.fluid.layers.dropout Computes dropout. @@ -1188,6 +1187,7 @@ def chunk_eval(input, num_correct_chunks) +@deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax") def softmax(input, use_cudnn=False, name=None, axis=-1): """ This operator implements the softmax layer. 
The calculation process is as follows: @@ -3362,6 +3362,15 @@ def data_norm(input, "BatchSum": batch_sum, "BatchSquareSum": batch_square_sum } + attrs = { + "epsilon": epsilon, + "sync_stats": sync_stats, + "summary_decay_rate": summary_decay_rate, + } + if slot_dim > 0: + attrs["slot_dim"] = slot_dim + if enable_scale_and_shift: + attrs["enable_scale_and_shift"] = enable_scale_and_shift if enable_scale_and_shift: inputs["scale_w"] = scale_w inputs["bias"] = bias @@ -3376,13 +3385,7 @@ def data_norm(input, "BatchSum": batch_sum, "BatchSquareSum": batch_square_sum }, - attrs={ - "epsilon": epsilon, - "slot_dim": slot_dim, - "sync_stats": sync_stats, - "summary_decay_rate": summary_decay_rate, - "enable_scale_and_shift": enable_scale_and_shift - }) + attrs=attrs) return helper.append_activation(data_norm_out) @@ -4594,7 +4597,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): Args: input (Variable): The input variable which is a Tensor, the data type is float32, float64, int32, int64. - dim (list|int, optional): The dimensions along which the product is performed. If + dim (int|list|tuple, optional): The dimensions along which the product is performed. If :attr:`None`, multiply all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, @@ -4634,9 +4637,18 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_prod(y, dim=[0, 1]) # [105.0, 384.0] """ helper = LayerHelper('reduce_prod', **locals()) - out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): - dim = [dim] + if isinstance(dim, tuple): + dim = list(dim) + elif isinstance(dim, int): + dim = [dim] + else: + raise TypeError( + "The type of axis must be int, list or tuple, but received {}". + format(type(dim))) + check_variable_and_dtype( + input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod') + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) helper.append_op( type='reduce_prod', inputs={'X': input}, @@ -5024,6 +5036,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): return out +@deprecated(since="2.0.0", update_to="paddle.matmul") def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): """ Applies matrix multiplication to two tensors. 
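The hunk below re-inlines the legacy matmul implementation (a dygraph fast path via core.ops.matmul plus the static-graph shape checks) instead of delegating to paddle.matmul. A minimal sketch of the call it keeps supporting, under the imperative-mode idioms used elsewhere in this patch:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(np.random.rand(2, 3).astype('float32'))
        y = fluid.dygraph.to_variable(np.random.rand(3, 2).astype('float32'))
        out = fluid.layers.matmul(x, y)   # now deprecated in favour of paddle.matmul
        print(out.shape)                  # [2, 2]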
@@ -5095,7 +5108,65 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32') out = fluid.layers.matmul(x, y, True, True) """ - return paddle.matmul(x, y, transpose_x, transpose_y, alpha, name) + attrs = { + 'transpose_X': transpose_x, + 'transpose_Y': transpose_y, + 'alpha': float(alpha), + } + + if in_dygraph_mode(): + out = _varbase_creator(dtype=x.dtype) + core.ops.matmul(x, y, out, 'transpose_X', transpose_x, 'transpose_Y', + transpose_y, 'alpha', float(alpha)) + return out + + def __check_input(x, y): + var_names = {'x': x, 'y': y} + for name, val in var_names.items(): + check_variable_and_dtype( + val, name, ['float16', 'float32', 'float64'], 'matmul') + x_shape = list(x.shape) + y_shape = list(y.shape) + if len(x_shape) == 1: + x_shape = [1] + x_shape + if len(y_shape) == 1: + y_shape = y_shape + [1] + + # check the inner 2 dimensions + if transpose_x: + x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2] + if transpose_y: + y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] + if x_shape[-1] != y_shape[-2]: + assert (x_shape[-1] == -1) or (y_shape[-2] == -1), \ + "After performing an optional transpose, Input X's width should be " \ + "equal to Y's width for multiplication " \ + "prerequisites. But received X's shape: %s, Y's shape: %s\n" % \ + (x_shape, y_shape) + + if len(y_shape) > 2 and len(x_shape) > 2: + for i, dim_x in enumerate(x_shape[:-2]): + # don't check neg shape + if dim_x < 0 or y_shape[i] < 0: + continue + if dim_x != y_shape[i]: + raise ValueError( + "When the matrix is larger than 2 dimensions, the higher " + "dimensional values of the two matrices need to be equal. " + "But received x_shape[%d] != y_shape[%d]. X's shape: %s, " + "Y's shape: %s.\n" % (i, i, x_shape, y_shape)) + + __check_input(x, y) + + helper = LayerHelper('matmul', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='matmul', + inputs={'X': x, + 'Y': y}, + outputs={'Out': out}, + attrs=attrs) + return out def topk(input, k, name=None): @@ -5800,6 +5871,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): return loss +@deprecated(since='2.0.0', update_to='paddle.nn.functional.one_hot') def one_hot(input, depth, allow_out_of_range=False): """ @@ -5963,7 +6035,6 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): """ :alias_main: paddle.reshape :alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape - :old_api: paddle.fluid.layers.reshape This operator changes the shape of ``x`` without changing its data. @@ -6006,14 +6077,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): The parameter ``actual_shape`` will be deprecated in the future and only use ``shape`` instead to represent the target shape. Args: - x(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32``, ``float64``, ``int32`` or ``int64``. - shape(list|tuple|Variable): Define the target shape. At most one dimension of the target shape can be -1. + x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``. + shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1. The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. - If ``shape`` is an Variable, it should be an 1-D Tensor . 
+ If ``shape`` is an Tensor, it should be an 1-D Tensor . actual_shape(variable, optional): An 1-D ``Tensor`` or ``LoDTensor`` . The data type is ``int32`` . If provided, reshape according to this given shape rather than ``shape`` specifying shape. That is to say ``actual_shape`` has a higher priority - than ``shape(list|tuple)`` but not ``shape(Variable)``. \ + than ``shape(list|tuple)`` but not ``shape(Tensor)``. \ This argument ``actual_shape`` will be removed in a future version. \ Instructions for updating: ``actual_shape`` will be removed in future versions and replaced by ``shape``. act (str, optional): The non-linear activation to be applied to the reshaped input. Default None. @@ -6025,10 +6096,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: - Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable. + Tensor: A reshaped Tensor with the same data type as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable. Raises: - TypeError: If actual_shape is neither Variable nor None. + TypeError: If actual_shape is neither Tensor nor None. ValueError: If more than one elements of ``shape`` is -1. ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. ValueError: If the elements in ``shape`` is negative except -1. @@ -6039,7 +6110,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): import paddle.fluid as fluid # example 1: - # attr shape is a list which doesn't contain tensor Variable. + # attr shape is a list which doesn't contain Tensors. data_1 = fluid.data( name='data_1', shape=[2, 4, 6], dtype='float32') reshaped_1 = fluid.layers.reshape( @@ -6047,7 +6118,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): # the shape of reshaped_1 is [2,4,3,2]. # example 2: - # attr shape is a list which contains tensor Variable. + # attr shape is a list which contains Tensors. data_2 = fluid.layers.fill_constant([2,25], "int32", 3) dim = fluid.layers.fill_constant([1], "int32", 5) reshaped_2 = fluid.layers.reshape(data_2, shape=[dim, 10]) @@ -8139,9 +8210,9 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'): return image_resize(input=input, out_shape=out_shape, resample=resample) +@deprecated(since="2.0.0", update_to="paddle.gather") def gather(input, index, overwrite=True): """ - **Gather Layer** Output is obtained by gathering entries of the outer-most dimension of X indexed by `index` and concatenate them together. @@ -8168,19 +8239,21 @@ def gather(input, index, overwrite=True): [5, 6]] Args: - input (Variable): The source input tensor with rank>=1. Supported data type is + input (Tensor): The source input tensor with rank>=1. Supported data type is int32, int64, float32, float64 and uint8 (only for CPU), float16 (only for GPU). - index (Variable): The index input tensor with rank=1. Data type is int32 or int64. + index (Tensor): The index input tensor with rank=1. Data type is int32 or int64. overwrite (bool, optional): The mode that updating the grad when has same index. 
If True, use the overwrite mode to update the grad of the same index, if False, use the accumulate mode to update the grad of the same index. Default value is True. - - Returns: - output (Variable): The output is a tensor with the same rank as input. + output (Tensor): The output is a tensor with the same rank as input. + + Raises: + TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. + TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. Examples: @@ -8191,6 +8264,13 @@ def gather(input, index, overwrite=True): index = fluid.data(name='index', shape=[-1, 1], dtype='int32') output = fluid.layers.gather(x, index) """ + if in_dygraph_mode(): + return core.ops.gather(input, index, None) + + check_variable_and_dtype( + input, 'x', + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], 'gather') + check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') helper = LayerHelper('gather', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) @@ -8203,6 +8283,7 @@ def gather(input, index, overwrite=True): return out +@deprecated(since="2.0.0", update_to="paddle.gather_nd") def gather_nd(input, index, name=None): """ **Gather Nd Layer** @@ -8255,14 +8336,18 @@ def gather_nd(input, index, name=None): = [23] Args: - input (Variable): The source input. Its dtype should be int32, int64, float32, float64. - index (Variable): The index input with rank > 1, index.shape[-1] <= input.rank. - Its dtype should be int32, int64. - name (str|None): A name for this layer(optional). If set None, the - layer will be named automatically. + input (Tensor): The input Tensor which it's data type should be bool, float32, float64, int32, int64. + index (Tensor): The index input with rank > 1, index.shape[-1] <= input.rank. + Its dtype should be int32, int64. + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . Returns: - output (Variable): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] + output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] + + Raises: + TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of float32, float64, int32 and int64. + TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. 
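A minimal sketch of the gather/gather_nd semantics documented above, exercising the new dygraph fast paths (core.ops.gather / core.ops.gather_nd). The input values follow the gather docstring; the gather_nd index [[1, 1]] is a hypothetical illustration of the index.shape[:-1] + input.shape[index.shape[-1]:] output rule.

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(
            np.array([[1, 2], [3, 4], [5, 6]], dtype='float32'))
        idx = fluid.dygraph.to_variable(np.array([1, 2], dtype='int32'))
        print(fluid.layers.gather(x, idx).numpy())        # [[3. 4.] [5. 6.]]

        nd_idx = fluid.dygraph.to_variable(np.array([[1, 1]], dtype='int32'))
        print(fluid.layers.gather_nd(x, nd_idx).numpy())  # [4.]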
Examples: @@ -8274,6 +8359,12 @@ def gather_nd(input, index, name=None): output = fluid.layers.gather_nd(x, index) """ + if in_dygraph_mode(): + return core.ops.gather_nd(input, index) + check_variable_and_dtype(input, 'input', + ['bool', 'float32', 'float64', 'int32', 'int64'], + 'gather_np') + check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np') helper = LayerHelper('gather_nd', **locals()) dtype = helper.input_dtype() output = helper.create_variable_for_type_inference(dtype) @@ -8285,6 +8376,7 @@ def gather_nd(input, index, name=None): return output +@deprecated(since="2.0.0", update_to="paddle.scatter") def scatter(input, index, updates, name=None, overwrite=True): """ :alias_main: paddle.scatter @@ -8600,7 +8692,7 @@ def log(x, name=None): return out -@templatedoc() +@deprecated(since="2.0.0", update_to="paddle.nn.functional.relu") def relu(x, name=None): """ ${comment} @@ -8642,11 +8734,9 @@ def relu(x, name=None): return out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.selu") def selu(x, scale=None, alpha=None, name=None): """ - :alias_main: paddle.nn.functional.selu - :alias: paddle.nn.functional.selu,paddle.nn.functional.activation.selu - :old_api: paddle.fluid.layers.selu Selu Operator. @@ -9261,7 +9351,7 @@ def pad2d(input, return out -@templatedoc() +@deprecated(since="2.0.0", update_to="paddle.nn.functional.elu") def elu(x, alpha=1.0, name=None): """ :alias_main: paddle.nn.functional.elu @@ -9303,12 +9393,9 @@ def elu(x, alpha=1.0, name=None): return out -@templatedoc() +@deprecated(since="2.0.0", update_to="paddle.nn.functional.relu6") def relu6(x, threshold=6.0, name=None): """ - :alias_main: paddle.nn.functional.relu6 - :alias: paddle.nn.functional.relu6,paddle.nn.functional.activation.relu6 - :old_api: paddle.fluid.layers.relu6 ${comment} @@ -9580,6 +9667,7 @@ def swish(x, beta=1.0, name=None): return out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.prelu") def prelu(x, mode, param_attr=None, name=None): """ :api_attr: Static Graph @@ -9708,13 +9796,10 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): return out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.leaky_relu") @templatedoc() def leaky_relu(x, alpha=0.02, name=None): """ - :alias_main: paddle.nn.functional.leaky_relu - :alias: paddle.nn.functional.leaky_relu,paddle.nn.functional.activation.leaky_relu - :old_api: paddle.fluid.layers.leaky_relu - ${comment} Args: x(${x_type}): ${x_comment} @@ -9743,19 +9828,7 @@ def leaky_relu(x, alpha=0.02, name=None): res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res]) print(res_val) # [[-0.1, 2], [3, -0.4]] """ - if in_dygraph_mode(): - return core.ops.leaky_relu(x, 'alpha', alpha) - - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'leaky_relu') - - inputs = {'X': [x]} - attrs = {'alpha': alpha} - helper = LayerHelper('leaky_relu', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='leaky_relu', inputs=inputs, outputs={'Out': out}, attrs=attrs) - return out + return paddle.nn.functional.leaky_relu(x, alpha, name) def soft_relu(x, threshold=40.0, name=None): @@ -10320,6 +10393,7 @@ def expand_as(x, target_tensor, name=None): from paddle.fluid.framework import convert_np_dtype_to_dtype_ +@deprecated(since='1.8.0', update_to="paddle.uniform") @templatedoc() def uniform_random_batch_size_like(input, shape, @@ -10415,6 +10489,7 @@ def uniform_random_batch_size_like(input, return out +@deprecated(since="2.0.0", 
update_to="paddle.normal") @templatedoc() def gaussian_random(shape, mean=0.0, @@ -10589,6 +10664,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): return out +@deprecated(since='1.8.0', update_to="paddle.normal") @templatedoc() def gaussian_random_batch_size_like(input, shape, @@ -11177,6 +11253,7 @@ def rank(input): return out +@deprecated(since="2.0.0", update_to="paddle.numel") def size(input): """ **Size Layer** @@ -11184,11 +11261,14 @@ def size(input): Returns the number of elements for a tensor, which is a int64 Tensor with shape [1]. Args: - input (Variable): The input variable. + input (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64. Returns: - Variable: The number of elements for the input variable. + Tensor: The number of elements for the input Tensor. + Raises: + TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of bool, float16, float32, float64, int32, int64. + Examples: .. code-block:: python @@ -11199,6 +11279,11 @@ def size(input): rank = layers.size(input) # 300 """ + if in_dygraph_mode(): + return core.ops.size(x) + check_variable_and_dtype( + x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + "size") helper = LayerHelper('size', **locals()) out = helper.create_variable_for_type_inference(dtype='int64') helper.append_op(type='size', inputs={'Input': input}, outputs={'Out': out}) @@ -11424,6 +11509,7 @@ Examples: return _elementwise_op(LayerHelper('elementwise_add', **locals())) +@deprecated(since="2.0.0", update_to="paddle.divide") def elementwise_div(x, y, axis=-1, act=None, name=None): """ :alias_main: paddle.elementwise_div @@ -11847,6 +11933,7 @@ Examples: return _elementwise_op(LayerHelper('elementwise_pow', **locals())) +@deprecated(since="2.0.0", update_to="paddle.remainder") def elementwise_mod(x, y, axis=-1, act=None, name=None): """ :alias_main: paddle.elementwise_mod @@ -11884,6 +11971,7 @@ Examples: return _elementwise_op(LayerHelper('elementwise_mod', **locals())) +@deprecated(since="2.0.0", update_to="paddle.floor_divide") def elementwise_floordiv(x, y, axis=-1, act=None, name=None): """ :alias_main: paddle.elementwise_floordiv @@ -11998,6 +12086,13 @@ Examples: def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): + if in_dygraph_mode(): + op = getattr(core.ops, op_name) + if binary_op: + return op(x, y) + else: + return op(x) + check_variable_and_dtype(x, "x", ["bool"], op_name) if y is not None: check_variable_and_dtype(y, "y", ["bool"], op_name) @@ -12022,28 +12117,27 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): return out -@templatedoc() def logical_and(x, y, out=None, name=None): """ - :alias_main: paddle.logical_and - :alias: paddle.logical_and, paddle.tensor.logical_and, paddle.tensor.logic.logical_and - :old_api: paddle.fluid.layers.logical_and - ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``. + ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``. Each element of ``out`` is calculated by .. math:: out = x \&\& y + .. note:: + ``paddle.logical_and`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + Args: - x(${x_type}): ${x_comment}. - y(${y_type}): ${y_comment}. 
- out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output. - name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. + x (Tensor): the input tensor, it's data type should be bool. + y (Tensor): the input tensor, it's data type should be bool. + out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - ${out_type}: ${out_comment} + N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``. Examples: .. code-block:: python @@ -12052,40 +12146,38 @@ def logical_and(x, y, out=None, name=None): import numpy as np paddle.disable_static() - x_data = np.array([True, True, False, False], dtype=np.bool) + x_data = np.array([True], dtype=np.bool) y_data = np.array([True, False, True, False], dtype=np.bool) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_and(x, y) - print(res.numpy()) # [True False False False] + print(res.numpy()) # [True False True False] """ - return _logical_op( op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True) -@templatedoc() def logical_or(x, y, out=None, name=None): """ - :alias_main: paddle.logical_or - :alias: paddle.logical_or, paddle.tensor.logical_or, paddle.tensor.logic.logical_or - :old_api: paddle.fluid.layers.logical_or - ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``. + ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``. Each element of ``out`` is calculated by .. math:: out = x || y + .. note:: + ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + Args: - x(${x_type}): ${x_comment}. - y(${y_type}): ${y_comment}. - out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output. - name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. + x (Tensor): the input tensor, it's data type should be bool. + y (Tensor): the input tensor, it's data type should be bool. + out(Tensor): The ``Variable`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - ${out_type}: ${out_comment} + N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``. Examples: .. 
code-block:: python @@ -12094,40 +12186,38 @@ def logical_or(x, y, out=None, name=None): import numpy as np paddle.disable_static() - x_data = np.array([True, True, False, False], dtype=np.bool) - y_data = np.array([True, False, True, False], dtype=np.bool) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x_data = np.array([True, False], dtype=np.bool).reshape(2, 1) + y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) res = paddle.logical_or(x, y) - print(res.numpy()) # [True True True False] + print(res.numpy()) # [[ True True] [ True False]] """ - return _logical_op( op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True) -@templatedoc() def logical_xor(x, y, out=None, name=None): """ - :alias_main: paddle.logical_xor - :alias: paddle.logical_xor, paddle.tensor.logical_xor, paddle.tensor.logic.logical_xor - :old_api: paddle.fluid.layers.logical_xor - ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``. + ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``. Each element of ``out`` is calculated by .. math:: out = (x || y) \&\& !(x \&\& y) + .. note:: + ``paddle.logical_xor`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + Args: - x(${x_type}): ${x_comment}. - y(${y_type}): ${y_comment}. - out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output. - name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. + x (Tensor): the input tensor, it's data type should be bool. + y (Tensor): the input tensor, it's data type should be bool. + out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - ${out_type}: ${out_comment} + N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``. Examples: .. 
code-block:: python @@ -12136,14 +12226,13 @@ def logical_xor(x, y, out=None, name=None): import numpy as np paddle.disable_static() - x_data = np.array([True, True, False, False], dtype=np.bool) - y_data = np.array([True, False, True, False], dtype=np.bool) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x_data = np.array([True, False], dtype=np.bool).reshape([2, 1]) + y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2]) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) res = paddle.logical_xor(x, y) - print(res.numpy()) # [False True True False] + print(res.numpy()) # [[False, True], [ True, False]] """ - return _logical_op( op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True) @@ -12189,8 +12278,6 @@ def logical_not(x, out=None, name=None): @templatedoc() def clip(x, min, max, name=None): """ - :alias_main: paddle.nn.clip - :alias: paddle.nn.clip,paddle.nn.clip.clip :old_api: paddle.fluid.layers.clip ${comment} @@ -12285,13 +12372,10 @@ def clip_by_norm(x, max_norm, name=None): return out +@deprecated(since="2.0.0", update_to="paddle.mean") @templatedoc() def mean(x, name=None): """ - :alias_main: paddle.mean - :alias: paddle.mean,paddle.tensor.mean,paddle.tensor.stat.mean - :old_api: paddle.fluid.layers.mean - ${comment} Args: @@ -13970,12 +14054,9 @@ def where(condition): return out +@deprecated(since="2.0.0", update_to="paddle.sign") def sign(x): """ - :alias_main: paddle.sign - :alias: paddle.sign,paddle.tensor.sign,paddle.tensor.math.sign - :old_api: paddle.fluid.layers.sign - This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero. Args: @@ -14009,17 +14090,11 @@ def sign(x): def unique(x, dtype='int32'): """ - :alias_main: paddle.unique - :alias: paddle.unique,paddle.tensor.unique,paddle.tensor.manipulation.unique - :old_api: paddle.fluid.layers.unique - - **unique** - Return a unique tensor for `x` and an index tensor pointing to this unique tensor. Args: - x(Variable): A 1-D input tensor. - dtype(np.dtype|core.VarDesc.VarType|str): The type of index tensor: int32, int64. + x(Tensor): A 1-D input tensor, it's data type should be float32, float64, int32, int64. + dtype(np.dtype|str, optional): The type of index tensor: int32, int64. Default: int32. Returns: tuple: (out, index). 
`out` is the unique tensor for `x`, with identical dtype to `x`, and \ diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 1a61072ace6ff2255411374f99d0ed03acbe02bc..84cacea6ba5723f8a06fc87fa9c59d96f802e65a 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -20,17 +20,19 @@ from ..framework import convert_np_dtype_to_dtype_, Variable from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from paddle.utils import deprecated +__deprecated_func_name__ = {'tanh_shrink': 'tanhshrink', } + __activations_noattr__ = [ 'sigmoid', 'logsigmoid', 'tanh_shrink', 'softplus', 'softsign', + 'tanh', ] __unary_func__ = [ 'exp', - 'tanh', 'atan', 'sqrt', 'rsqrt', @@ -64,14 +66,20 @@ __all__ += __activations_noattr__ __all__ += __unary_func__ for _OP in set(__activations_noattr__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] func = generate_activation_fn(_OP) func = deprecated( - since="2.0.0", update_to="paddle.nn.functional.%s" % (_OP))(func) + since="2.0.0", update_to="paddle.nn.functional.%s" % (_new_OP))(func) globals()[_OP] = func for _OP in set(__unary_func__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] func = generate_activation_fn(_OP) - func = deprecated(since="2.0.0", update_to="paddle.%s" % (_OP))(func) + func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(func) globals()[_OP] = func add_sample_code(globals()["sigmoid"], r""" @@ -160,16 +168,14 @@ add_sample_code(globals()["tanh_shrink"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F + import numpy as np + paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) - out = F.tanh_shrink(x) - print(out.numpy()) - # [-0.02005104 -0.00262468 0.00033201 0.00868739] + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """) @@ -401,16 +407,14 @@ add_sample_code(globals()["softplus"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F + import numpy as np + paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) - out = F.softplus(x) - print(out.numpy()) - # [0.51301525 0.59813887 0.74439666 0.85435524] + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] """) @@ -418,16 +422,14 @@ add_sample_code(globals()["softsign"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle import paddle.nn.functional as F + import numpy as np + paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) - out = F.softsign(x) - print(out.numpy()) - # [-0.28571429 -0.16666667 0.09090909 0.23076923] + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """) @@ -730,7 +732,7 @@ __all__ += ['erf'] _erf_ = generate_layer_fn('erf') -def erf(x): +def erf(x, name=None): locals_var = locals().copy() kwargs = dict() for name, val in locals_var.items(): @@ -740,10 +742,6 @@ def erf(x): erf.__doc__ = """ - :alias_main: paddle.erf - :alias: paddle.erf,paddle.tensor.erf,paddle.tensor.math.erf,paddle.nn.functional.erf,paddle.nn.functional.activation.erf - :old_api: paddle.fluid.layers.erf - :strong:`Erf Operator` For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). @@ -753,57 +751,22 @@ Equation: Args: - x(Variable): The input of Erf op, Tensor or LoDTensor, dtype: float32 or float64. + x (Tensor): The input tensor, it's data type should be float32, float64. Returns: - Variable: The output of Erf op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input. + Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input. Examples: .. code-block:: python - # declarative mode - import numpy as np - from paddle import fluid - - x = fluid.data(name="x", shape=(-1, 3), dtype="float32") - y = fluid.layers.erf(x) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - start = fluid.default_startup_program() - main = fluid.default_main_program() - - data = np.random.randn(2, 3).astype("float32") - exe.run(start) - - y_np, = exe.run(main, feed={"x": data}, fetch_list=[y]) - - data - # array([[ 0.4643714 , -1.1509596 , 1.2538221 ], - # [ 0.34369683, 0.27478245, 1.1805398 ]], dtype=float32) - y_np - # array([[ 0.48863927, -0.8964121 , 0.9237998 ], - # [ 0.37307587, 0.30242872, 0.9049887 ]], dtype=float32) - - .. code-block:: python - - # imperative mode import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - - data = np.random.randn(2, 3).astype("float32") - place = fluid.CPUPlace() - with dg.guard(place) as g: - x = dg.to_variable(data) - y = fluid.layers.erf(x) - y_np = y.numpy() - data - # array([[ 0.4643714 , -1.1509596 , 1.2538221 ], - # [ 0.34369683, 0.27478245, 1.1805398 ]], dtype=float32) - y_np - # array([[ 0.48863927, -0.8964121 , 0.9237998 ], - # [ 0.37307587, 0.30242872, 0.9049887 ]], dtype=float32) + import paddle + paddle.disable_static() + x_data = np.array([-0.4, -0.2, 0.1, 0.3]) + x = paddle.to_tensor(x_data) + out = paddle.erf(x) + print(out.numpy()) + # [-0.42839236 -0.22270259 0.11246292 0.32862676] """ diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index bc1368b562d7b354ce34dc87679fd8a0c5a3d012..fe8ed83923e88be2a0c98a8a539f26500b43b7cb 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -38,6 +38,7 @@ __all__ = [ 'Decoder', 'BeamSearchDecoder', 'rnn', + 'birnn', 'dynamic_decode', 'DecodeHelper', 'TrainingHelper', @@ -438,61 +439,146 @@ def rnn(cell, is_reverse=False, **kwargs): """ - :api_attr: Static Graph - rnn creates a recurrent neural network specified by RNNCell `cell`, - which performs :code:`cell.call()` repeatedly until reaches to the maximum - length of `inputs`. 
- - Parameters: - cell(RNNCell): An instance of `RNNCell`. - inputs(Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states(Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. - sequence_length(Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - time_major(bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - is_reverse(bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. + which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) + repeatedly until reaches to the maximum length of `inputs`. + + Arguments: + cell(RNNCellBase): An instance of `RNNCellBase`. + inputs(Tensor): the input sequences. + If time_major is True, the shape is + `[time_steps, batch_size, input_size]` + else the shape is `[batch_size, time_steps, input_size]`. + initial_states(Tensor|tuple|list, optional): the initial state of the + rnn cell. Tensor or a possibly nested structure of tensors. If not + provided, `cell.get_initial_states` would be called to produce + the initial state. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Defaults to False. + **kwargs: Additional keyword arguments to pass to `forward` of the cell. Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.call` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. + (outputs, final_states) + outputs (Tensor|list|tuple): the output sequence. Tensor or nested + structure of Tensors. 
+            If `time_major` is True, the shape of each tensor in outputs is
+            `[time_steps, batch_size, hidden_size]`, else
+            `[batch_size, time_steps, hidden_size]`.
+        final_states (Tensor|list|tuple): final states. A (possibly nested structure of)
+            tensor[s], representing the final state for RNN. It has the same
+            structure as the initial states. Each tensor in final states has the same
+            shape and dtype as the corresponding tensor in initial states.
    Examples:
        .. code-block:: python
-
-            import paddle.fluid as fluid
-            inputs = fluid.data(name="inputs",
-                                shape=[-1, 32, 128],
-                                dtype="float32")
-            cell = fluid.layers.GRUCell(hidden_size=128)
-            outputs = fluid.layers.rnn(cell=cell, inputs=inputs)
+            import paddle
+            paddle.disable_static()
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+
+            inputs = paddle.rand((4, 23, 16))
+            prev_h = paddle.randn((4, 32))
+            outputs, final_states = paddle.nn.functional.rnn(cell, inputs, prev_h)
+
    """
+    if in_dygraph_mode():
+        return _rnn_dynamic_graph(cell, inputs, initial_states, sequence_length,
+                                  time_major, is_reverse, **kwargs)
+    else:
+        return _rnn_static_graph(cell, inputs, initial_states, sequence_length,
+                                 time_major, is_reverse, **kwargs)
+
+
+class ArrayWrapper(object):
+    def __init__(self, x):
+        self.array = [x]
+
+    def append(self, x):
+        self.array.append(x)
+        return self
+
+
+def _maybe_copy(state, new_state, step_mask):
+    """update rnn state or just pass the old state through"""
+    new_state = nn.elementwise_mul(new_state, step_mask, axis=0) \
+        + nn.elementwise_mul(state, (1 - step_mask), axis=0)
+    return new_state
+
+
+def _transpose_batch_time(x):
+    perm = [1, 0] + list(range(2, len(x.shape)))
+    return nn.transpose(x, perm)
+
+
+def _rnn_dynamic_graph(cell,
+                       inputs,
+                       initial_states=None,
+                       sequence_length=None,
+                       time_major=False,
+                       is_reverse=False,
+                       **kwargs):
+    time_step_index = 0 if time_major else 1
+    flat_inputs = flatten(inputs)
+    time_steps = flat_inputs[0].shape[time_step_index]
+
+    if not time_major:
+        inputs = map_structure(_transpose_batch_time, inputs)
+
+    if sequence_length is not None:
+        mask = sequence_lod.sequence_mask(
+            sequence_length, maxlen=time_steps, dtype=inputs.dtype)
+        mask = nn.transpose(mask, [1, 0])
+
+    if is_reverse:
+        inputs = map_structure(lambda x: tensor.reverse(x, axis=[0]), inputs)
+        mask = tensor.reverse(mask, axis=[0]) \
+            if sequence_length is not None else None
+
+    states = initial_states
+    outputs = []
+    for i in range(time_steps):
+        step_inputs = map_structure(lambda x: x[i], inputs)
+        step_outputs, new_states = cell(step_inputs, states, **kwargs)
+        if sequence_length is not None:
+            new_states = map_structure(
+                partial(
+                    _maybe_copy, step_mask=mask[i]), states, new_states)
+        states = new_states
+        outputs = map_structure(lambda x: ArrayWrapper(x),
+                                step_outputs) if i == 0 else map_structure(
+                                    lambda x, x_array: x_array.append(x),
+                                    step_outputs, outputs)
+
+    final_outputs = map_structure(
+        lambda x: nn.stack(x.array, axis=time_step_index),
+        outputs)
+
+    if is_reverse:
+        final_outputs = map_structure(
+            lambda x: tensor.reverse(x, axis=time_step_index),
+            final_outputs)
+
+    final_states = new_states
+    return final_outputs, final_states
+
+
+def _rnn_static_graph(cell,
+                      inputs,
+                      initial_states=None,
+                      sequence_length=None,
+                      time_major=False,
+                      is_reverse=False,
+                      **kwargs):
    check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn')
    if isinstance(inputs, (list, tuple)):
        for i, input_x in enumerate(inputs):
@@ -500,30 +586,10 @@ def rnn(cell,
                                     ['float32', 'float64'], 'rnn')
    check_type(initial_states,
               'initial_states', (Variable, list, tuple, type(None)), 'rnn')
-    if isinstance(initial_states, (list, tuple)):
-        states = map_structure(lambda x: x, initial_states)[0]
-        for i, state in enumerate(states):
-            if isinstance(state, (list, tuple)):
-                for j, state_j in enumerate(state):
-                    check_variable_and_dtype(state_j, 'state_j[' + str(j) + ']',
-                                             ['float32', 'float64'], 'rnn')
-            else:
-                check_variable_and_dtype(state, 'states[' + str(i) + ']',
-                                         ['float32', 'float64'], 'rnn')
    check_type(sequence_length, 'sequence_length', (Variable, type(None)), 'rnn')
-    def _maybe_copy(state, new_state, step_mask):
-        # TODO: use where_op
-        new_state = nn.elementwise_mul(
-            new_state, step_mask, axis=0) - nn.elementwise_mul(
-                state, (step_mask - 1), axis=0)
-        return new_state
-
-    def _transpose_batch_time(x):
-        return nn.transpose(x, [1, 0] + list(range(2, len(x.shape))))
-
    def _switch_grad(x, stop=False):
        x.stop_gradient = stop
        return x
@@ -582,6 +648,98 @@ def rnn(cell,
    return (final_outputs, final_states)
+def birnn(cell_fw,
+          cell_bw,
+          inputs,
+          initial_states=None,
+          sequence_length=None,
+          time_major=False,
+          **kwargs):
+    """
+    birnn creates a bidirectional recurrent neural network specified by
+    RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()`
+    (for dygraph mode :code:`cell.forward`) repeatedly until it reaches
+    the maximum length of `inputs` and then concatenates the outputs of both RNNs
+    along the last axis.
+
+    Arguments:
+        cell_fw(RNNCellBase): An instance of `RNNCellBase`.
+        cell_bw(RNNCellBase): An instance of `RNNCellBase`.
+        inputs(Tensor): the input sequences.
+            If time_major is True, the shape is
+            `[time_steps, batch_size, input_size]`
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states(tuple, optional): A tuple of initial states of
+            `cell_fw` and `cell_bw`.
+            If not provided, `cell.get_initial_states` would be called to
+            produce initial state for each cell. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time step
+            index are not less than the valid length are treated as paddings.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+        **kwargs: Additional keyword arguments to pass to `forward` of each cell.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the outputs of the bidirectional RNN. It is the
+            concatenation of the outputs from the forward RNN and backward
+            RNN along the last axis.
+            If time major is True, the shape is `[time_steps, batch_size, size]`,
+            else the shape is `[batch_size, time_steps, size]`, where size is
+            `cell_fw.hidden_size + cell_bw.hidden_size`.
+        final_states (tuple): A tuple of the final states of the forward
+            cell and backward cell.
+
+    Examples:
+
+        ..
code-block:: python + + import paddle + paddle.disable_static() + + cell_fw = paddle.nn.LSTMCell(16, 32) + cell_bw = paddle.nn.LSTMCell(16, 32) + + inputs = paddle.rand((4, 23, 16)) + hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32)) + hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32)) + initial_states = ((hf, cf), (hb, cb)) + outputs, final_states = paddle.nn.functional.birnn( + cell_fw, cell_bw, inputs, initial_states) + + """ + if initial_states is None: + states_fw = cell_fw.get_initial_states( + batch_ref=inputs, batch_dim_idx=1 if time_major else 0) + states_bw = cell_fw.get_initial_states( + batch_ref=inputs, batch_dim_idx=1 if time_major else 0) + else: + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major, + **kwargs) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True, + **kwargs) + + outputs = map_structure(lambda x, y: tensor.concat([x, y], -1), outputs_fw, + outputs_bw) + + final_states = (states_fw, states_bw) + return outputs, final_states + + class Decoder(object): """ :api_attr: Static Graph diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 7ac67b1bc817964ca65d5b7009b446458d2cc7ab..77a78eb4a14a0a5ad9be9cff71131ca473106ab8 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -26,6 +26,7 @@ from .. import core from .layer_function_generator import templatedoc from . import utils from ..data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from paddle.utils import deprecated import numpy import warnings @@ -642,7 +643,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): shape(list|tuple|Tensor): Shape of the output Tensor, the data type of ``shape`` is int32 or int64. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``shape`` is an Tensor, it should be an 1-D Tensor with date type int32 or int64. - dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Tensor which can + dtype(np.dtype|str): Data type of the output Tensor which can be float16, float32, float64, int32, int64. value(bool|float|int|Tensor): The constant value used to initialize the Tensor to be created. If ``value`` is an Tensor, it should be an 1-D Tensor. @@ -746,6 +747,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): return out +@deprecated(since='1.8.0', update_to="paddle.fill_constant") @templatedoc() def fill_constant_batch_size_like(input, shape, @@ -1040,7 +1042,7 @@ def ones(shape, dtype, force_cpu=False): Parameters: shape(tuple|list|Tensor): Shape of output Tensor, the data type of shape is int32 or int64. - dtype (np.dtype|core.VarDesc.VarType|str): Data type of output Tensor, it supports + dtype (np.dtype|str): Data type of output Tensor, it supports bool, float16, float32, float64, int32 and int64. force_cpu (bool, optional): Whether force to store the output Tensor in CPU memory. If :attr:`force_cpu` is False, the output Tensor will be stored in running device memory. @@ -1073,7 +1075,7 @@ def zeros(shape, dtype, force_cpu=False, name=None): Parameters: shape(tuple|list|Tensor): Shape of output Tensor, the data type of ``shape`` is int32 or int64. 
- dtype (np.dtype|core.VarDesc.VarType|str): Data type of output Tensor, it supports + dtype (np.dtype|str): Data type of output Tensor, it supports bool, float16, float32, float64, int32 and int64. force_cpu (bool, optional): Whether force to store the output Tensor in CPU memory. If :attr:`force_cpu` is False, the output Tensor will be stored in running device memory. @@ -1435,14 +1437,14 @@ def linspace(start, stop, num, dtype=None, name=None): This OP return fixed number of evenly spaced values within a given interval. Args: - start(float|Tensor): The input :attr:`start` is start variable of range. It is a float scalar, \ - or a Tensor of shape [1] with input data type float32, float64. - stop(float|Tensor): The input :attr:`stop` is start variable of range. It is a float scalar, \ - or a Tensor of shape [1] with input data type float32, float64. + start(int|float|Tensor): The input :attr:`start` is start variable of range. It is a scalar, \ + or a Tensor of shape [1] with input data type int32, int64, float32 or float64. + stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a scalar, \ + or a Tensor of shape [1] with input data type int32, int64, float32 or float64. num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \ - or a Tensor of shape [1] with data type int32. - dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output tensor, it could be 'float32' and 'float64'. - Default: if None, the data type is float32. + or a Tensor of shape [1] with data type int32 or int64. + dtype(np.dtype|str, optional): The data type of output tensor, it could be + int32, int64, float32 and float64. Default: if None, the data type is float32. name(str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.Default: None. @@ -1452,9 +1454,11 @@ def linspace(start, stop, num, dtype=None, name=None): the value with input :attr:`start`. Raises: - TypeError: The ``dtype`` must be one of float32 and float64. - TypeError: The data type of ``start`` and ``stop`` must be one of float32 and float64. - TypeError: The data type of ``num`` must be one of int32 and int64. + TypeError: The ``dtype`` must be one of int32, int64, float32 and float64. + TypeError: The type of ``num`` must be int When it's not a Tensor. + TypeError: The data type of ``num`` must be int32 When it's a Tensor. + TypeError: The data type of ``start`` and ``stop`` must be same as ``dtype`` When it's a Tensor. 
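Editor's note (not part of the patch): the Examples block itself is not shown in this hunk, so here is a short sketch of the widened dtype support described above. The values assume the usual evenly spaced output of linspace; only the string/integer dtype handling is new in this patch:

.. code-block:: python

    import paddle.fluid as fluid

    # float dtype, as before
    a = fluid.layers.linspace(0, 10, 5, dtype='float32')   # evaluates to [0., 2.5, 5., 7.5, 10.]
    # integer dtypes are now accepted as well
    b = fluid.layers.linspace(0, 10, 11, dtype='int32')    # evaluates to [0, 1, 2, ..., 10]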
+ Examples: @@ -1467,29 +1471,47 @@ def linspace(start, stop, num, dtype=None, name=None): """ if dtype is None: dtype = 'float32' + tensor_num = num + tensor_start = start + tensor_stop = stop + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) if not isinstance(start, Variable): - start = fill_constant([1], dtype, start) + tensor_start = fill_constant([1], dtype, start) if not isinstance(stop, Variable): - stop = fill_constant([1], dtype, stop) + tensor_stop = fill_constant([1], dtype, stop) if not isinstance(num, Variable): - num = fill_constant([1], 'int32', num) + tensor_num = fill_constant([1], 'int32', num) if in_dygraph_mode(): - return core.ops.linspace(start, stop, num) + return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', + dtype) helper = LayerHelper("linspace", **locals()) - check_dtype(start.dtype, 'start', ['float32', 'float64'], 'linspace') - check_dtype(stop.dtype, 'stop', ['float32', 'float64'], 'linspace') - check_dtype(num.dtype, 'num', ['int32', 'int64'], 'linspace') - check_dtype(dtype, 'dtype', ['float32', 'float64'], 'linspace') + if isinstance(start, Variable): + check_dtype(start.dtype, 'start', (convert_dtype(dtype)), 'linspace') + else: + check_type(start, 'start', (int, float), 'linspace') - out = helper.create_variable_for_type_inference(dtype=start.dtype) + if isinstance(stop, Variable): + check_dtype(stop.dtype, 'stop', (convert_dtype(dtype)), 'linspace') + else: + check_type(stop, 'stop', (int, float), 'linspace') + if isinstance(num, Variable): + check_dtype(num.dtype, 'num', ['int32'], 'linspace') + else: + check_type(num, 'num', (int), 'linspace') + check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], + 'linspace') + + out = helper.create_variable_for_type_inference(dtype=dtype) helper.append_op( type='linspace', - inputs={'Start': start, - 'Stop': stop, - 'Num': num}, + inputs={'Start': tensor_start, + 'Stop': tensor_stop, + 'Num': tensor_num}, + attrs={'dtype': dtype}, outputs={'Out': [out]}) return out @@ -1537,6 +1559,7 @@ def zeros_like(x, out=None): return out +@deprecated(since="2.0.0", update_to="paddle.diag") def diag(diagonal): """ :alias_main: paddle.diag @@ -1598,7 +1621,7 @@ def eye(num_rows, If None, default: num_rows. batch_shape(list, optional): If provided, the returned tensor will have a leading batch size of this shape, the data type of ``batch_shape`` is int. Default is None. - dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned tensor. + dtype(np.dtype|str, optional): The data type of the returned tensor. It should be int32, int64, float16, float32, float64, default is 'float32'. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d7e6a9d211b8bba3003ff9446d45f58c07e81864..8f34576b836a5412a6792a6dfd63b3c9fd8de560 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -40,6 +40,7 @@ from paddle.fluid.layers import tensor from functools import reduce from .wrapped_decorator import signature_safe_contextmanager from .. 
import compat as cpt +import paddle __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', @@ -67,14 +68,16 @@ class Optimizer(object): regularization=None, grad_clip=None, name=None): + # Because of the loop import, so place it in the function body + from paddle.optimizer.lr_scheduler import _LRScheduler self._parameter_list = list( parameter_list) if parameter_list is not None else None self._name = name if framework.in_dygraph_mode(): - if not isinstance(learning_rate, float) and \ - not isinstance(learning_rate, LearningRateDecay): + if not isinstance(learning_rate, + (float, LearningRateDecay, _LRScheduler)): raise TypeError( - "learning rate should be float or LearningRateDecay, got %s here" + "learning rate should be float or _LRScheduler, got %s here" % type(learning_rate)) if self._parameter_list is None: raise AttributeError( @@ -89,11 +92,11 @@ class Optimizer(object): % regularization.__str__()) break else: - if not isinstance(learning_rate, float) and \ - not isinstance(learning_rate, framework.Variable): + if not isinstance(learning_rate, + (float, framework.Variable, _LRScheduler)): raise TypeError( - "learning rate should be float or Variable, got %s here" % - type(learning_rate)) + "learning rate should be float or _LRScheduler, got %s here" + % type(learning_rate)) if grad_clip is not None: if not isinstance(grad_clip, GradientClipBase): @@ -143,11 +146,15 @@ class Optimizer(object): state_dict = adam.state_dict() ''' + from paddle.optimizer.lr_scheduler import _LRScheduler state_dict = {} for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp # global step if use lr decay + if isinstance(self._learning_rate, _LRScheduler): + state_dict["LR_Scheduler"] = self._learning_rate.state_dict() + return state_dict if isinstance(self._learning_rate, LearningRateDecay): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() @@ -191,6 +198,9 @@ class Optimizer(object): adam.set_dict(opti_state_dict) ''' + from paddle.optimizer.lr_scheduler import _LRScheduler + if isinstance(self._learning_rate, _LRScheduler): + self._learning_rate.set_dict(state_dict["LR_Scheduler"]) if isinstance(self._learning_rate, LearningRateDecay): self._learning_rate.set_dict(state_dict["LR_Scheduler"]) @@ -251,6 +261,30 @@ class Optimizer(object): return self._opti_name_list def _create_global_learning_rate(self): + from paddle.optimizer.lr_scheduler import _LRScheduler + if isinstance(self._learning_rate, _LRScheduler): + lr_var = self._global_learning_rate() + # only create global lr_var once + if not isinstance(lr_var, framework.Variable): + lr_name = unique_name.generate('learning_rate') + self._learning_rate._var_name = lr_name + lr_var = self.helper.create_global_variable( + name=lr_name, + shape=[1], + persistable=True, + stop_gradient=True, + dtype='float32' if self._dtype is None else self._dtype) + main_prog = framework.default_main_program() + main_prog.lr_sheduler = self._learning_rate + main_prog.lr_var = lr_var + self._learning_rate_map[framework.default_main_program( + )] = lr_var + + lr_value = float(self._learning_rate()) + self.helper.set_variable_initializer( + lr_var, initializer=Constant(value=lr_value)) + return + if imperative_base.enabled(): # create learning rate Variable if isinstance(self._learning_rate, float): @@ -1141,7 +1175,7 @@ class MomentumOptimizer(Optimizer): class DGCMomentumOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph DGC (Deep Gradient 
Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 @@ -3067,7 +3101,7 @@ Lamb = LambOptimizer class ModelAverage(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph The ModelAverage optimizer accumulates specific continuous historical parameters during training. The accumulated historical range can be controlled by the passed @@ -3376,7 +3410,7 @@ class ModelAverage(Optimizer): class ExponentialMovingAverage(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Compute the moving average of parameters with exponential decay. Given a parameter :math:`\\theta`, its exponential moving average (EMA) @@ -3626,7 +3660,7 @@ class ExponentialMovingAverage(object): class PipelineOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph Pipeline Optimizer: Make a program to run as pipeline, that is splitting a program into multiple sections (sub-programs) and each section run on a @@ -3690,7 +3724,8 @@ class PipelineOptimizer(object): def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): if framework.in_dygraph_mode(): raise Exception("In dygraph, don't support PipelineOptimizer.") - if not isinstance(optimizer, Optimizer): + if not isinstance(optimizer, Optimizer) and not isinstance( + optimizer, paddle.optimizer.Optimizer): raise ValueError("The 'optimizer' parameter for " "PipelineOptimizer must be an instance of " "Optimizer, but the given type is {}.".format( @@ -4477,7 +4512,7 @@ class PipelineOptimizer(object): class RecomputeOptimizer(Optimizer): """ - :api_attr: Static Graph + :api_attr: Static Graph Recompute Optimizer Wrapper @@ -4562,7 +4597,7 @@ class RecomputeOptimizer(Optimizer): def load(self, stat_dict): """ - :api_attr: Static Graph + :api_attr: Static Graph load function is not supported by Recompute Optimizer for now. :return: None @@ -4786,7 +4821,7 @@ class RecomputeOptimizer(Optimizer): class LookaheadOptimizer(object): """ - :api_attr: Static Graph + :api_attr: Static Graph This implements the Lookahead optimizer of the paper : https://arxiv.org/abs/1907.08610. @@ -4929,6 +4964,11 @@ class LookaheadOptimizer(object): mod = layers.elementwise_mod(step, k) with layers.control_flow.Switch() as switch: + with switch.case(step == one_var): + for param_name in params: + fast_var = main_block.var(param_name) + slow_var = param_to_slow[param_name] + layers.assign(input=fast_var, output=slow_var) with switch.case(mod == zero_var): for param_name in params: fast_var = main_block.var(param_name) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index a45443632b04835bc8f3b3f2f167433c7a8b49d4..8e0470bededd4fdb8aec03893590bdba35bbb364 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -204,6 +204,9 @@ class WeightNormParamAttr(ParamAttr): """ :api_attr: Static Graph + Note: + Please use 'paddle.nn.utils.weight_norm' in dygraph mode. + Parameter of weight Norm. Weight Norm is a reparameterization of the weight vectors in a neural network that decouples the magnitude of those weight vectors from their direction. Weight Norm has been implemented as discussed in this @@ -216,6 +219,7 @@ class WeightNormParamAttr(ParamAttr): It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient. There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . + Args: dim(int): Dimension over which to compute the norm. 
Dim is a non-negative diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 7e633756fce64a8f2f85be596cf11904d285c49a..76c95be75d67d60cd59efe13ecba6f01a1c1d614 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1039,7 +1039,7 @@ class GeneratorLoader(DataLoaderBase): self._reader = core.create_py_reader( self.queue, self._var_names, self._shapes, self._dtypes, self._need_check_feed, self._places, self._use_double_buffer, - self._drop_last, True) + self._drop_last, False) def _init_non_iterable(self): lod_levels = [] diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 126b4465eae480d1d012eb706667b125dce5f0ea..6220bf62c79c30737f923e744d5670818f54ff6e 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -13,6 +13,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_listen_and_serv_op) +list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -33,8 +34,9 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_base) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3) list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) @@ -44,6 +46,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -53,6 +56,14 @@ if(NOT WITH_GPU OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_allgather) LIST(REMOVE_ITEM TEST_OPS test_allreduce) LIST(REMOVE_ITEM TEST_OPS test_broadcast) + LIST(REMOVE_ITEM TEST_OPS test_collective_reduce) + LIST(REMOVE_ITEM TEST_OPS test_collective_scatter) + LIST(REMOVE_ITEM TEST_OPS test_collective_reduce_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api) LIST(REMOVE_ITEM TEST_OPS test_reducescatter) LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api) endif() @@ -90,10 +101,16 @@ endif() LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint) +LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1) LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2) +LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint3) +LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_multiple) +LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_dist_basic) +LIST(REMOVE_ITEM 
TEST_OPS test_hdfs1) +LIST(REMOVE_ITEM TEST_OPS test_hdfs2) +LIST(REMOVE_ITEM TEST_OPS test_hdfs3) LIST(REMOVE_ITEM TEST_OPS test_checkpoint_saver) if(APPLE OR WIN32) - LIST(REMOVE_ITEM TEST_OPS test_hdfs) LIST(REMOVE_ITEM TEST_OPS test_fs_interface) LIST(REMOVE_ITEM TEST_OPS test_fleet_metric) endif() @@ -235,6 +252,51 @@ function(bash_test_modules TARGET_NAME) endif() endfunction() +function(parallel_bash_test_modules TARGET_NAME) + if(NOT WITH_TESTING) + return() + endif() + + set(options SERIAL) + set(oneValueArgs TIMEOUT START_BASH) + set(multiValueArgs DEPS ENVS LABELS UnitTests) + cmake_parse_arguments(parallel_bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + + set(timeout 120) + if(${parallel_bash_test_modules_TIMEOUT}) + set(timeout ${parallel_bash_test_modules_TIMEOUT}) + endif() + + list(JOIN parallel_bash_test_modules_UnitTests " " uts_string) + + if(WITH_COVERAGE) + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python + TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string} + WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data + bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + else() + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python + TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string} + bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + if (parallel_bash_test_modules_SERIAL) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + endif() + + if(parallel_bash_test_modules_LABELS) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout} LABELS ${parallel_bash_test_modules_LABELS}) + else() + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout}) + endif() +endfunction() + + list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_profiler) @@ -270,6 +332,8 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_sampling_id_op) + + if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset_dataloader) @@ -283,6 +347,7 @@ if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dynamic) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset) + list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset) endif() if(NOT WITH_GPU OR WIN32 OR APPLE) @@ -362,17 +427,21 @@ if(WITH_DISTRIBUTE) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base") - # FIXME(seiriosX) will readd after PR 22957 Merged + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_ctr") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_lars") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_train") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_save_load") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow") + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr") list(REMOVE_ITEM DIST_TEST_OPS 
"test_dist_text_classification") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_train") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_word2vec") + # FIXME(seiriosX) will fix this + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr") + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo") + py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS}) py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS}) py_test_modules(test_communicator_async MODULES test_communicator_async ENVS ${dist_ENVS}) @@ -382,13 +451,15 @@ if(WITH_DISTRIBUTE) py_test_modules(test_collective_optimizer MODULES test_collective_optimizer) if(NOT APPLE) py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS}) + py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS ${dist_ENVS}) py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_graph_execution_meta_optimizer MODULES test_fleet_graph_execution_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) + py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) + py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) if(NOT WIN32) py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) @@ -418,6 +489,7 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_launch_ps START_BASH test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch START_BASH test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + # port range (20000, 23000) is reserved for dist-ops set(dist_ut_port 20001) foreach(TEST_OP ${DIST_TEST_OPS}) bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") @@ -457,13 +529,20 @@ if(NOT WIN32) endif() if(NOT APPLE AND NOT WIN32) - bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 600) - bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 600) - bash_test_modules(test_checkpoint_saver START_BASH dist_test.sh TIMEOUT 600) + bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_hdfs2 
START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140) endif() add_subdirectory(sequence) add_subdirectory(dygraph_to_static) +add_subdirectory(rnn) if (WITH_MKLDNN) add_subdirectory(mkldnn) @@ -503,4 +582,13 @@ if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_multiprocess_dataloader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_multiprocess_dataloader_iterable_dataset_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") + set_tests_properties(test_multiprocess_dataloader_dataset PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") endif() + +# setting timeout value for old unittests +# set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) +set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) +set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) diff --git a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py index 812730e9523f8d24ade68474b858e04b41fc6895..529ff4ec45d1fdc6d1d8e765e38cff53d36aade7 100644 --- a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py +++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py @@ -30,11 +30,11 @@ from paddle.fluid import unique_name import numpy as np from paddle.io import Dataset, BatchSampler, DataLoader -BATCH_NUM = 20 -BATCH_SIZE = 16 +BATCH_NUM = 4 +BATCH_SIZE = 1 #IMAGE_SIZE = 128 -CLASS_NUM = 10 +CLASS_NUM = 2 USE_GPU = False # whether use GPU to run model places = fluid.cuda_places() if USE_GPU else fluid.cpu_places() @@ -59,7 +59,7 @@ def sample_list_generator_creator(): for _ in range(BATCH_NUM): sample_list = [] for _ in range(BATCH_SIZE): - image, label = get_random_images_and_labels([16, 16], [1]) + image, label = get_random_images_and_labels([4, 4], [1]) sample_list.append([image, label]) yield sample_list @@ -75,8 +75,7 @@ class AutoCheckpointBase(unittest.TestCase): minimize=True, iterable=True): def simple_net(): - image = fluid.data( - name='image', shape=[-1, 16, 16], dtype='float32') + image = fluid.data(name='image', shape=[-1, 4, 4], dtype='float32') label = fluid.data(name='label', shape=[-1, 1], dtype='int64') fc_tmp = fluid.layers.fc(image, size=CLASS_NUM) diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective_allgather_api.py new file mode 100644 index 0000000000000000000000000000000000000000..bdf4ca07ae9b57e083137945f58aaabb571e20ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_allgather_api.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + + +class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tensor_list = [] + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + paddle.distributed.all_gather(tensor_list, tindata) + return tensor_list + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllgatherAPI, "allgather") diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py new file mode 100644 index 0000000000000000000000000000000000000000..aea429ae5e3e622ee1b584796ef87edc1d4c8d72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + + +class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + paddle.distributed.all_reduce(tindata) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllreduceAPI, "allreduce") diff --git a/python/paddle/fluid/tests/unittests/collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective_barrier_api.py new file mode 100644 index 0000000000000000000000000000000000000000..09b3c27126d926ac7175f6045f385adf4d530b44 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_barrier_api.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + + +class TestCollectiveBarrierAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + paddle.distributed.barrier() + return [] + + +if __name__ == "__main__": + runtime_main(TestCollectiveBarrierAPI, "barrier") diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py new file mode 100644 index 0000000000000000000000000000000000000000..a879a027b50688234c8efb8468e6eac660d8a145 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
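Editor's note (not part of the patch): the collective_*_api.py runners added in this patch (allgather, allreduce and barrier above; broadcast, reduce and scatter below) share one skeleton and differ only in the collective call issued inside get_model. A condensed sketch of that shared shape, using the allreduce variant from this patch:

.. code-block:: python

    import paddle
    import paddle.fluid as fluid
    import paddle.fluid.layers as layers
    from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main


    class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
        def __init__(self):
            self.global_ring_id = 0

        def get_model(self, main_prog, startup_program, rank):
            # build the static program under test; only this block changes per runner
            with fluid.program_guard(main_prog, startup_program):
                tindata = layers.data(
                    name="tindata", shape=[10, 1000], dtype='float32')
                paddle.distributed.all_reduce(tindata)
                return [tindata]


    if __name__ == "__main__":
        runtime_main(TestCollectiveAllreduceAPI, "allreduce")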
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + + +class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + paddle.distributed.broadcast(tindata, src=1) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveBroadcastAPI, "broadcast") diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective_reduce_api.py new file mode 100644 index 0000000000000000000000000000000000000000..3e89b1cb3ee8550d3dbb4e1a055f092e57126c7f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_reduce_api.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + + +class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + paddle.distributed.reduce(tindata, dst=0) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveReduceAPI, "reduce") diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/collective_reduce_op.py new file mode 100644 index 0000000000000000000000000000000000000000..da61284344b58d44c5ba02af5ed42c553f857c94 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_reduce_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base import TestCollectiveRunnerBase, runtime_main + + +class TestCollectiveReduce(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + rootid = 1 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofreduce", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_reduce_sum", + inputs={'X': tindata}, + attrs={'ring_id': ring_id, + 'root_id': rootid}, + outputs={'Out': toutdata}) + main_prog.global_block().append_op( + type="c_sync_comm_stream", + inputs={'X': toutdata}, + outputs={'Out': toutdata}, + attrs={'ring_id': ring_id}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveReduce, "reduce", 0) diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py new file mode 100644 index 0000000000000000000000000000000000000000..7e6904286234364e7ae84a5c21b9826885f99dc4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base import TestCollectiveRunnerBase, runtime_main + + +class TestCollectiveReduce(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + rootid = 1 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofreduce", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_reduce_sum", + inputs={'X': tindata}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'root_id': rootid + }, + outputs={'Out': toutdata}) + main_prog.global_block().append_op( + type="c_sync_comm_stream", + inputs={'X': toutdata}, + outputs={'Out': toutdata}, + attrs={'ring_id': ring_id}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveReduce, "reduce", 0) diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective_scatter_api.py new file mode 100644 index 0000000000000000000000000000000000000000..f68929ad3b36d5a0bf145a93b30172f0422dc9f9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_scatter_api.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + + +class TestCollectiveScatterAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", + shape=[10, 1000], + dtype='float64', + append_batch_size=False) + toutdata = layers.fill_constant( + shape=[5, 1000], dtype='float64', value=1.0) + tensor_list = None + if rank == 1: + tensor_list = paddle.split(tindata, 2, axis=0) + paddle.distributed.scatter(toutdata, tensor_list, src=1) + return [toutdata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveScatterAPI, "scatter") diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_op.py b/python/paddle/fluid/tests/unittests/collective_scatter_op.py new file mode 100644 index 0000000000000000000000000000000000000000..efe5e17bcce1ecddf859edbb3543876fe5fc9f89 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_scatter_op.py @@ -0,0 +1,71 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
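Editor's note (not part of the patch): a small numpy sketch of what the scatter runner above sets up, assuming two ranks as in the test. The source rank (rank 1) splits its [10, 1000] float64 tensor along axis 0, and every rank ends up holding one [5, 1000] slice:

.. code-block:: python

    import numpy as np

    nranks = 2
    tindata = np.random.random((10, 1000)).astype('float64')
    # mirrors paddle.split(tindata, 2, axis=0) on the source rank
    chunks = np.split(tindata, nranks, axis=0)
    recv_rank0, recv_rank1 = chunks          # each slice has shape (5, 1000)
    assert recv_rank0.shape == (5, 1000)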
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base import TestCollectiveRunnerBase, runtime_main + + +class TestCollectiveScatter(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + rootid = 1 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofreduce", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_scatter", + inputs={'X': tindata}, + attrs={'ring_id': ring_id, + 'root': rootid, + 'nranks': 2}, + outputs={'Out': toutdata}) + main_prog.global_block().append_op( + type="c_sync_comm_stream", + inputs={'X': toutdata}, + outputs={'Out': toutdata}, + attrs={'ring_id': ring_id}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveScatter, "scatter", 0) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 2f3d3ced6f84e4f50b9f26a79c2874bfaeb440fa..73b546b95cfeb8032c6e99eabe24c883d1f5f66c 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -162,24 +162,17 @@ class TestDistCTR2x2(FleetDistRunnerBase): exe = fluid.Executor(fluid.CPUPlace()) fleet.init_worker() - exe.run(fleet.startup_program) - + exe.run(fluid.default_startup_program()) batch_size = 4 train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) self.reader.decorate_sample_list_generator(train_reader) - compiled_prog = fluid.compiler.CompiledProgram( - fleet.main_program).with_data_parallel( - loss_name=self.avg_cost.name, - build_strategy=self.strategy.get_build_strategy(), - exec_strategy=self.strategy.get_execute_strategy()) - for epoch_id in range(1): self.reader.start() try: pass_start = time.time() while True: - loss_val = exe.run(program=compiled_prog, + loss_val = exe.run(program=fluid.default_main_program(), fetch_list=[self.avg_cost.name]) loss_val = np.mean(loss_val) # TODO(randomly fail) @@ -209,7 +202,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): exe = fluid.Executor(fluid.CPUPlace()) fleet.init_worker() - exe.run(fleet.startup_program) + exe.run(fluid.default_startup_program()) thread_num = 2 batch_size = 128 @@ -231,7 +224,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): pass_start = time.time() dataset.set_filelist(filelist) exe.train_from_dataset( - program=fleet.main_program, + program=fluid.default_main_program(), dataset=dataset, fetch_list=[self.avg_cost], fetch_info=["cost"], diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..03d0fa447daf3e3a502e7d77491045f92695496c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py @@ -0,0 +1,152 @@ +# 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Distribute CTR model for test fleet api +""" + +from __future__ import print_function + +import shutil +import tempfile +import time + +import paddle +import paddle.fluid as fluid +import os +import numpy as np + +import ctr_dataset_reader +from test_dist_fleet_base import runtime_main, FleetDistRunnerBase +from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader +from paddle.distributed.fleet.base.util_factory import fleet_util + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +class TestDistGpuPsCTR2x2(TestDistCTR2x2): + """ + For test CTR model, using Fleet api & PS-GPU + """ + + def check_model_right(self, dirname): + model_filename = os.path.join(dirname, "__model__") + + with open(model_filename, "rb") as f: + program_desc_str = f.read() + + program = fluid.Program.parse_from_string(program_desc_str) + with open(os.path.join(dirname, "__model__.proto"), "w") as wn: + wn.write(str(program)) + + def do_pyreader_training(self, fleet): + """ + do training using dataset, using fetch handler to catch variable + Args: + fleet(Fleet api): the fleet object of Parameter Server, define distribute training role + """ + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + exe = fluid.Executor(place) + fleet.init_worker() + exe.run(fleet.startup_program) + + batch_size = 4 + train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) + self.reader.decorate_sample_list_generator(train_reader) + + for epoch_id in range(1): + self.reader.start() + try: + pass_start = time.time() + while True: + loss_val = exe.run(program=fleet.main_program, + fetch_list=[self.avg_cost.name]) + loss_val = np.mean(loss_val) + reduce_output = fleet_util.all_reduce( + np.array(loss_val), mode="sum") + loss_all_trainer = fleet_util.all_gather(float(loss_val)) + loss_val = float(reduce_output) / len(loss_all_trainer) + message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id, + loss_val) + fleet_util.print_on_rank(message, 0) + + pass_time = time.time() - pass_start + except fluid.core.EOFException: + self.reader.reset() + + model_dir = tempfile.mkdtemp() + fleet.save_inference_model( + exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost) + self.check_model_right(model_dir) + if fleet.is_first_worker(): + fleet.save_persistables(executor=exe, dirname=model_dir) + shutil.rmtree(model_dir) + fleet.stop_worker() + + def do_dataset_training(self, fleet): + dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data( + ) + + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + exe = fluid.Executor(place) + + fleet.init_worker() + exe.run(fleet.startup_program) + + thread_num = 2 + batch_size = 128 + filelist = [] + for _ in range(thread_num): + filelist.append(train_file_path) + + # config dataset + dataset = 
paddle.fleet.DatasetFactory().create_dataset() + dataset.set_batch_size(batch_size) + dataset.set_use_var(self.feeds) + pipe_command = 'python ctr_dataset_reader.py' + dataset.set_pipe_command(pipe_command) + + dataset.set_filelist(filelist) + dataset.set_thread(thread_num) + + for epoch_id in range(1): + pass_start = time.time() + dataset.set_filelist(filelist) + exe.train_from_dataset( + program=fleet.main_program, + dataset=dataset, + fetch_list=[self.avg_cost], + fetch_info=["cost"], + print_period=2, + debug=int(os.getenv("Debug", "0"))) + pass_time = time.time() - pass_start + + if os.getenv("SAVE_MODEL") == "1": + model_dir = tempfile.mkdtemp() + fleet.save_inference_model(exe, model_dir, + [feed.name for feed in self.feeds], + self.avg_cost) + self.check_model_right(model_dir) + if fleet.is_first_worker(): + fleet.save_persistables(executor=exe, dirname=model_dir) + shutil.rmtree(model_dir) + + fleet.stop_worker() + + +if __name__ == "__main__": + runtime_main(TestDistGpuPsCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py index c69e1247a9bb8f97350ae79bcc6df1bc645204ea..77697896b4d556da8a98c17e281b3d7a6999fd64 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py @@ -152,24 +152,18 @@ class TestDistCTR2x2(FleetDistRunnerBase): exe = fluid.Executor(fluid.CPUPlace()) fleet.init_worker() - exe.run(fleet.startup_program) + exe.run(fluid.default_startup_program()) batch_size = 4 train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) self.reader.decorate_sample_list_generator(train_reader) - compiled_prog = fluid.compiler.CompiledProgram( - fleet.main_program).with_data_parallel( - loss_name=self.avg_cost.name, - build_strategy=self.strategy.get_build_strategy(), - exec_strategy=self.strategy.get_execute_strategy()) - for epoch_id in range(1): self.reader.start() try: while True: - loss_val = exe.run(program=compiled_prog, + loss_val = exe.run(program=fluid.default_main_program(), fetch_list=[self.avg_cost.name]) loss_val = np.mean(loss_val) print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id, diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh index 42566f63b68e2c24c459c0bd738455d1ec3bc3da..d5a6490042b20a4f9160c55fbb93e9f2f8092eae 100644 --- a/python/paddle/fluid/tests/unittests/dist_test.sh +++ b/python/paddle/fluid/tests/unittests/dist_test.sh @@ -61,7 +61,14 @@ for i in {1..2}; do fi done +echo "dist space:" +df -h + #display /tmp/files +echo "ls /tmp/paddle.*" ls -l /tmp/paddle.* +echo "ls -l ./" +ls -l ./ + exit 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..75bff108dd43665df0fc1c8b166a935946b4fbc7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import create_paddle_predictor
+
+
+class PredictorTools(object):
+    '''
+    Paddle-Inference predictor
+    '''
+
+    def __init__(self, model_path, params_file, feeds_var):
+        '''
+        __init__
+        '''
+        self.model_path = model_path
+        self.params_file = params_file
+
+        self.feeds_var = feeds_var
+
+    def _load_model_and_set_config(self):
+        '''
+        load model from file and set analysis config
+        '''
+        if os.path.exists(os.path.join(self.model_path, self.params_file)):
+            config = AnalysisConfig(
+                os.path.join(self.model_path, "__model__"),
+                os.path.join(self.model_path, self.params_file))
+        else:
+            config = AnalysisConfig(os.path.join(self.model_path))
+
+        if fluid.is_compiled_with_cuda():
+            config.enable_use_gpu(100, 0)
+        else:
+            config.disable_gpu()
+        config.switch_specify_input_names(True)
+        config.switch_use_feed_fetch_ops(False)
+        config.enable_memory_optim()
+        config.disable_glog_info()
+        config.switch_ir_optim(True)
+
+        return config
+
+    def _get_analysis_outputs(self, config):
+        '''
+        Return outputs of paddle inference
+        Args:
+            config (AnalysisConfig): predictor configs
+        Returns:
+            outs (numpy array): forward network prediction outputs
+        '''
+        predictor = create_paddle_predictor(config)
+        tensor_shapes = predictor.get_input_tensor_shape()
+        names = predictor.get_input_names()
+        for i, name in enumerate(names):
+            #assert name in self.feeds_var, '{} not in feed dict'.format(name)
+            shape = tensor_shapes[name]
+            tensor = predictor.get_input_tensor(name)
+            feed_data = self.feeds_var[i]
+            tensor.copy_from_cpu(np.array(feed_data))
+            if type(feed_data) == fluid.LoDTensor:
+                tensor.set_lod(feed_data.lod())
+
+        # ensure no diff across multiple repeated runs
+        repeat_time = 10
+        for i in range(repeat_time):
+            predictor.zero_copy_run()
+
+        output_names = predictor.get_output_names()
+        outs = [
+            predictor.get_output_tensor(out_name).copy_to_cpu()
+            for out_name in output_names
+        ]
+
+        return outs
+
+    def __call__(self):
+        '''
+        __call__
+        '''
+        config = self._load_model_and_set_config()
+        outputs = self._get_analysis_outputs(config)
+
+        return outputs
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
index 68e6f328726f5b2664d31ac46394fa451631388c..d4646833ea2bd4e3c2c07a1962e4e866bdfe776e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
@@ -17,12 +17,13 @@ from __future__ import print_function
 
 import numpy
 import unittest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 from paddle.fluid.dygraph.jit import declarative
 
 
-@declarative
+@paddle.jit.to_static
 def dyfunc_assert_variable(x):
     x_v = fluid.dygraph.to_variable(x)
     assert x_v
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py index 27777a62799e104ac8a08fd67df8bdbe2a256724..f105dd5e94744ecca96ee0282432ff4946ab5e04 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py @@ -23,6 +23,8 @@ from paddle.fluid.dygraph.io import VARIABLE_FILENAME from bert_dygraph_model import PretrainModelLayer from bert_utils import get_bert_config, get_feed_data_reader +from predictor_utils import PredictorTools + program_translator = ProgramTranslator() place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace( ) @@ -152,6 +154,12 @@ def predict_dygraph_jit(data): return pred_res +def predict_analysis_inference(data): + output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, data) + out = output() + return out + + class TestBert(unittest.TestCase): def setUp(self): self.bert_config = get_bert_config() @@ -178,9 +186,11 @@ class TestBert(unittest.TestCase): dygraph_pred_res = predict_dygraph(self.bert_config, data) static_pred_res = predict_static(data) dygraph_jit_pred_res = predict_dygraph_jit(data) + predictor_pred_res = predict_analysis_inference(data) - for dy_res, st_res, dy_jit_res in zip( - dygraph_pred_res, static_pred_res, dygraph_jit_pred_res): + for dy_res, st_res, dy_jit_res, predictor_res in zip( + dygraph_pred_res, static_pred_res, dygraph_jit_pred_res, + predictor_pred_res): self.assertTrue( np.allclose(st_res, dy_res), "dygraph_res: {},\n static_res: {}".format( @@ -191,6 +201,11 @@ class TestBert(unittest.TestCase): "dygraph_jit_res: {},\n static_res: {}".format( dy_jit_res[~np.isclose(st_res, dy_jit_res)], st_res[~np.isclose(st_res, dy_jit_res)])) + self.assertTrue( + np.allclose(st_res, predictor_res), + "dygraph_jit_res: {},\n static_res: {}".format( + predictor_res[~np.isclose(st_res, predictor_res)], + st_res[~np.isclose(st_res, predictor_res)])) break diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index c01705dbe9ba655d9cfb538dfdde0474ffa30855..af7e73c41464dbd26c476f20d4a1533e37d34ce3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -15,13 +15,16 @@ import math import numpy as np import unittest - +import paddle +from paddle.jit import to_static import paddle.fluid as fluid from paddle.fluid import ParamAttr from paddle.fluid.dygraph import to_variable -from paddle.fluid.dygraph import declarative, ProgramTranslator +from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from predictor_utils import PredictorTools + SEED = 2020 DATATYPE = 'float32' program_translator = ProgramTranslator() @@ -240,7 +243,7 @@ class BMN(fluid.dygraph.Layer): param_attr=ParamAttr(name="PEM_2d4_w"), bias_attr=ParamAttr(name="PEM_2d4_b")) - @declarative + @to_static def forward(self, x): # Base Module x = self.b_conv1(x) @@ -558,8 +561,8 @@ def train_bmn(args, place, to_static): loss_data = [] with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) global local_random local_random = np.random.RandomState(SEED) @@ -693,9 +696,11 @@ class TestTrain(unittest.TestCase): static_pred_res = self.predict_static(video_data) dygraph_pred_res = 
self.predict_dygraph(video_data) dygraph_jit_pred_res = self.predict_dygraph_jit(video_data) + predictor_pred_res = self.predict_analysis_inference(video_data) - for dy_res, st_res, dy_jit_res in zip( - dygraph_pred_res, static_pred_res, dygraph_jit_pred_res): + for dy_res, st_res, dy_jit_res, predictor_res in zip( + dygraph_pred_res, static_pred_res, dygraph_jit_pred_res, + predictor_pred_res): self.assertTrue( np.allclose(st_res, dy_res), "dygraph_res: {},\n static_res: {}".format( @@ -706,6 +711,11 @@ class TestTrain(unittest.TestCase): "dygraph_jit_res: {},\n static_res: {}".format( dy_jit_res[~np.isclose(st_res, dy_jit_res)], st_res[~np.isclose(st_res, dy_jit_res)])) + self.assertTrue( + np.allclose(st_res, predictor_res), + "dygraph_jit_res: {},\n static_res: {}".format( + predictor_res[~np.isclose(st_res, predictor_res)], + st_res[~np.isclose(st_res, predictor_res)])) break def predict_dygraph(self, data): @@ -749,6 +759,11 @@ class TestTrain(unittest.TestCase): return pred_res + def predict_analysis_inference(self, data): + output = PredictorTools(self.args.infer_dir, VARIABLE_FILENAME, [data]) + out = output() + return out + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py new file mode 100644 index 0000000000000000000000000000000000000000..4a689354f56757ba754b76e3d407cb7083b95b3b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -0,0 +1,250 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from paddle.static import InputSpec +import paddle.fluid as fluid +from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit + +import unittest + +program_trans = ProgramTranslator() + + +class SimpleNet(Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.linear = fluid.dygraph.Linear(10, 3) + + @declarative(input_spec=[InputSpec(shape=[None, 10], dtype='float32')]) + def forward(self, x, a=1, b=2): + y = self.inner_function(x) + return y + + # `declarative` is not essential, add it to test for robustness. 
+    @declarative
+    def inner_function(self, x):
+        y = self.linear(x)
+        return y
+
+    def add_func(self, x, y):
+        z = x + y
+        return z
+
+    @declarative(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]])
+    def func_with_list(self, l):
+        x, y, int_val = l
+        z = x + y
+        z = z + int_val
+        return z
+
+    @declarative(input_spec=[{
+        'x': InputSpec([None, 10]),
+        'y': InputSpec([None, 10])
+    }])
+    def func_with_dict(self, d):
+        x = d['x']
+        y = d['y']
+        int_val = d['int_val']
+
+        z = x + y
+        z = z + int_val
+
+        return z
+
+    @declarative(input_spec=[[
+        InputSpec([None]), {
+            'x': InputSpec([None, 10]),
+            'y': InputSpec([None, 10])
+        }
+    ]])
+    def func_with_list_dict(self, dl):
+        bias = dl[0]
+        x = dl[1]['x']
+        y = dl[1]['y']
+
+        z = x + y
+        z = z + bias
+
+        return z
+
+
+class TestInputSpec(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def test_with_input_spec(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+
+            # 1. each method holds independent program cache
+            out = net(x)
+            self.assertTrue(len(net.forward.program_cache) == 1)
+
+            # 2. test save load
+            jit.save(net, './simple_net')
+            infer_net = fluid.dygraph.jit.load('./simple_net')
+            pred = infer_net(x)
+            self.assertTrue(np.allclose(out.numpy(), pred.numpy()))
+
+            # 3. we can decorate any method
+            x_2 = to_variable(np.ones([4, 20]).astype('float32'))
+            # uses `declarative(func)` instead of `@declarative`
+            net.add_func = declarative(net.add_func)
+            out = net.add_func(x_2, np.ones([20]).astype('float32'))
+            self.assertTrue(len(net.add_func.program_cache) == 1)
+
+            # 5. test input with list
+            out = net.func_with_list([x, y, int_val])
+
+            # 6. test input with dict
+            out = net.func_with_dict({'x': x, 'y': y, 'int_val': int_val})
+
+            # 7. test input with a list containing a dict
+            int_np = np.ones([1]).astype('float32')
+            out = net.func_with_list_dict([int_np, {'x': x, 'y': y}])
+
+    def test_with_error(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+
+            # 1. kwargs and input_spec should not be specified at the same time
+            with self.assertRaises(ValueError):
+                net(x, a=1, other_kwarg=2)
+
+            # 2. requires len(input_spec) <= len(args)
+            with self.assertRaises(ValueError):
+                net.add_func = declarative(
+                    net.add_func,
+                    input_spec=[
+                        InputSpec([-1, 10]), InputSpec([-1, 10]),
+                        InputSpec([10])
+                    ])
+                net.add_func(x, y)
+
+    def test_concrete_program(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+            # We can get the concrete_program by specifying InputSpec information; there is no need to fake input.
+ net.add_func = declarative( + net.add_func, + input_spec=[ + InputSpec([-1, 10]), InputSpec( + [-1, 10], name='y') + ]) + cp1 = net.add_func.concrete_program + self.assertTrue(cp1.inputs[-1].shape == (-1, 10)) + self.assertTrue(cp1.inputs[-1].name == 'y') + + # generate another program + net.add_func = declarative( + net.add_func, + input_spec=[InputSpec([10]), InputSpec( + [10], name='label')]) + cp2 = net.add_func.concrete_program + self.assertTrue(cp2.inputs[-1].shape == (10, )) + self.assertTrue(cp2.inputs[-1].name == 'label') + # Note(Aurelius84): New instance will be returned if we use `declarative(foo)` every time. + # So number of cache program is 1. + self.assertTrue(len(net.add_func.program_cache) == 1) + self.assertTrue(cp1 != cp2) + + +def foo_func(a, b, c=1, d=2): + z = a + b + return z + + +class TestDifferentInputSpecCacheProgram(unittest.TestCase): + def test_with_different_input(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + x_data = np.ones([16, 10]).astype('float32') + y_data = np.ones([10]).astype('float32') * 2 + z_data = np.ones([10]).astype('float32') * 2.2 + + foo = declarative(foo_func) + + # [16, 10] + [10] (varbase) + out_1 = foo(to_variable(x_data), to_variable(y_data)) + self.assertTrue(np.allclose(x_data + y_data, out_1.numpy())) + self.assertTrue(len(foo.program_cache) == 1) + + # [16, 10] + [10] (numpy) + out_2 = foo(to_variable(x_data), y_data) + self.assertTrue(np.allclose(x_data + y_data, out_2.numpy())) + self.assertTrue(len(foo.program_cache) == 1) + + # [16, 10] + [10] (numpy) + out_3 = foo(to_variable(x_data), z_data) + self.assertTrue(np.allclose(x_data + z_data, out_3.numpy())) + # hit cache program + self.assertTrue(len(foo.program_cache) == 1) + + # [16, 10] + [10] (numpy) with other different arguments (c=3) + out_4 = foo(to_variable(x_data), z_data, 3) + self.assertTrue(np.allclose(x_data + z_data, out_4.numpy())) + # create a new program + self.assertTrue(len(foo.program_cache) == 2) + + def test_get_concrete_program(self): + + foo = declarative(foo_func) + + # 1. specific InputSpec for `x`/`y` + concrete_program_1 = foo.get_concrete_program( + InputSpec([None, 10]), InputSpec([10])) + print(concrete_program_1) + self.assertTrue(len(foo.program_cache) == 1) + + # 2. specific `c`/`d` explicitly with same default value + concrete_program_2 = foo.get_concrete_program( + InputSpec([None, 10]), InputSpec([10]), 1, 2) + self.assertTrue(concrete_program_2 == concrete_program_1) + self.assertTrue(len(foo.program_cache) == 1) + + # 3. specific `c` = 2 + concrete_program_3 = foo.get_concrete_program( + InputSpec([None, 10]), InputSpec([10]), c=2) + self.assertTrue(concrete_program_3 != concrete_program_1) + self.assertTrue(len(foo.program_cache) == 2) + + # 4. specific x.shape = [10] + concrete_program_4 = foo.get_concrete_program( + InputSpec([10]), InputSpec([10])) + self.assertTrue(concrete_program_4 != concrete_program_1) + self.assertTrue(len(foo.program_cache) == 3) + + # 5. only specific InputSpec of x + with self.assertRaises(ValueError): + concrete_program_5 = foo.get_concrete_program(InputSpec([10])) + + # 6. 
specific unknown kwargs `e`=4 + concrete_program_5 = foo.get_concrete_program( + InputSpec([10]), InputSpec([10]), e=4) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py index c8051b3f24170693259b17456bcc124221117900..af1e44ffe212343c02a9bed8a8cacb0a966451aa 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py @@ -19,7 +19,7 @@ import numpy as np import unittest import paddle.fluid as fluid -from paddle.fluid.dygraph.jit import declarative +from paddle.jit import to_static from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator PLACE = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace( @@ -76,7 +76,7 @@ class MainNetWithDict(fluid.dygraph.Layer): self.output_size = output_size self.sub_net = SubNetWithDict(hidden_size, output_size) - @declarative + @to_static def forward(self, input, max_len=4): input = fluid.dygraph.to_variable(input) cache = { diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..88697bc1b36838afd743596cfec036271be33856 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
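+# This test module exercises FunctionSpec as added below: its constructor
+# attributes (args_name, dygraph_function, input_spec), input_spec validation,
+# unified_args_and_kwargs, and args_to_input_spec conversion.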
+ +import paddle +from paddle.static import InputSpec +from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec + +from test_declarative import foo_func + +import unittest + + +class TestFunctionSpec(unittest.TestCase): + def test_constructor(self): + foo_spec = FunctionSpec(foo_func) + args_name = foo_spec.args_name + self.assertListEqual(args_name, ['a', 'b', 'c', 'd']) + self.assertTrue(foo_spec.dygraph_function == foo_func) + self.assertTrue(foo_spec.input_spec is None) + + def test_verify_input_spec(self): + a_spec = InputSpec([None, 10], name='a') + b_spec = InputSpec([10], name='b') + + # type(input_spec) should be list or tuple + with self.assertRaises(TypeError): + foo_spec = FunctionSpec(foo_func, input_spec=a_spec) + + # each element of input_spec should be `InputSpec` + with self.assertRaises(ValueError): + foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, 10]) + + foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec]) + self.assertTrue(len(foo_spec.flat_input_spec) == 2) + + def test_unified_args_and_kwargs(self): + foo_spec = FunctionSpec(foo_func) + # case 1: foo(10, 20, c=4) + args, kwargs = foo_spec.unified_args_and_kwargs([10, 20], {'c': 4}) + self.assertTupleEqual(args, (10, 20, 4, 2)) + self.assertTrue(len(kwargs) == 0) + + # case 2: foo(a=10, b=20, d=4) + args, kwargs = foo_spec.unified_args_and_kwargs( + [], {'a': 10, + 'b': 20, + 'd': 4}) + self.assertTupleEqual(args, (10, 20, 1, 4)) + self.assertTrue(len(kwargs) == 0) + + # case 3: foo(10, b=20) + args, kwargs = foo_spec.unified_args_and_kwargs([10], {'b': 20}) + self.assertTupleEqual(args, (10, 20, 1, 2)) + self.assertTrue(len(kwargs) == 0) + + # assert len(self._arg_names) >= len(args) + with self.assertRaises(ValueError): + foo_spec.unified_args_and_kwargs([10, 20, 30, 40, 50], {'c': 4}) + + # assert arg_name should be in kwargs + with self.assertRaises(ValueError): + foo_spec.unified_args_and_kwargs([10], {'c': 4}) + + def test_args_to_input_spec(self): + a_spec = InputSpec([None, 10], name='a') + b_spec = InputSpec([10], name='b') + + a_tensor = paddle.static.data(name='a_var', shape=[4, 10]) + b_tensor = paddle.static.data(name='b_var', shape=[4, 10]) + kwargs = {'c': 1, 'd': 2} + + # case 1 + foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec]) + input_with_spec = foo_spec.args_to_input_spec( + (a_tensor, b_tensor, 1, 2), {}) + self.assertTrue(len(input_with_spec) == 4) + self.assertTrue(input_with_spec[0] == a_spec) # a + self.assertTrue(input_with_spec[1] == b_spec) # b + self.assertTrue(input_with_spec[2] == 1) # c + self.assertTrue(input_with_spec[3] == 2) # d + + # case 2 + foo_spec = FunctionSpec(foo_func, input_spec=[a_spec]) + input_with_spec = foo_spec.args_to_input_spec((a_tensor, b_tensor), {}) + self.assertTrue(len(input_with_spec) == 2) + self.assertTrue(input_with_spec[0] == a_spec) # a + self.assertTupleEqual(input_with_spec[1].shape, (4, 10)) # b.shape + self.assertEqual(input_with_spec[1].name, 'b_var') # b.name + + # case 3 + # assert kwargs is None if set `input_spec` + foo_spec = FunctionSpec(foo_func, input_spec=[a_spec]) + with self.assertRaises(ValueError): + input_with_spec = foo_spec.args_to_input_spec((a_tensor, b_tensor), + {'c': 4}) + + # case 4 + # assert len(args) >= len(self._input_spec) + foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec]) + with self.assertRaises(ValueError): + input_with_spec = foo_spec.args_to_input_spec((a_tensor, ), {}) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index fdf6daf6263e2bb7cf8ef2c3ad1373fb079f0037..4d735b565ddbcd0bea4e879f0ae5881e459c8f1d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -21,12 +21,15 @@ import unittest import os os.environ["CUDA_VISIBLE_DEVICES"] = "2" +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph import Embedding, Linear, GRUUnit from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from predictor_utils import PredictorTools + SEED = 2020 program_translator = ProgramTranslator() @@ -446,8 +449,8 @@ def do_train(args, to_static): place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) reader = get_random_input_data(args.batch_size, args.vocab_size, args.num_labels) @@ -536,6 +539,7 @@ class TestLACModel(unittest.TestCase): dy_pre = self.predict_dygraph(batch) st_pre = self.predict_static(batch) dy_jit_pre = self.predict_dygraph_jit(batch) + predictor_pre = self.predict_analysis_inference(batch) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -543,6 +547,10 @@ class TestLACModel(unittest.TestCase): np.allclose(dy_jit_pre, st_pre), msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre)) + self.assertTrue( + np.allclose(predictor_pre, st_pre), + msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre, + st_pre)) def predict_dygraph(self, batch): words, targets, length = batch @@ -591,6 +599,14 @@ class TestLACModel(unittest.TestCase): return pred_res.numpy() + def predict_analysis_inference(self, batch): + words, targets, length = batch + + output = PredictorTools(self.args.model_save_dir, VARIABLE_FILENAME, + [words, length]) + out = output() + return out + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..214cd95d3bc620b3bcadb88e57c7e54a593eaaf4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
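+# This test module exercises the dygraph_to_static logging utilities added
+# below: paddle.jit.set_verbosity / set_code_level and log_transformed_code.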
+ +from __future__ import print_function + +import io +import logging +import os +import sys +import unittest + +import gast +import six + +import paddle +from paddle.fluid.dygraph.dygraph_to_static import logging_utils + +# TODO(liym27): library mock needs to be installed separately in PY2, +# but CI environment has not installed mock yet. +# After discuss with Tian Shuo, now use mock only in PY3, and use it in PY2 after CI installs it. +if six.PY3: + from unittest import mock +# else: +# import mock + + +class TestLoggingUtils(unittest.TestCase): + def setUp(self): + self.verbosity_level = 1 + self.code_level = 3 + self.translator_logger = logging_utils._TRANSLATOR_LOGGER + + def test_verbosity(self): + paddle.jit.set_verbosity(None) + os.environ[logging_utils.VERBOSITY_ENV_NAME] = '3' + self.assertEqual(logging_utils.get_verbosity(), 3) + + paddle.jit.set_verbosity(self.verbosity_level) + self.assertEqual(self.verbosity_level, logging_utils.get_verbosity()) + + # String is not supported + with self.assertRaises(TypeError): + paddle.jit.set_verbosity("3") + + with self.assertRaises(TypeError): + paddle.jit.set_verbosity(3.3) + + def test_code_level(self): + + paddle.jit.set_code_level(None) + os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2' + self.assertEqual(logging_utils.get_code_level(), 2) + + paddle.jit.set_code_level(self.code_level) + self.assertEqual(logging_utils.get_code_level(), self.code_level) + + paddle.jit.set_code_level(9) + self.assertEqual(logging_utils.get_code_level(), 9) + + with self.assertRaises(TypeError): + paddle.jit.set_code_level(3.3) + + def test_log(self): + stream = io.BytesIO() if six.PY2 else io.StringIO() + log = self.translator_logger.logger + stdout_handler = logging.StreamHandler(stream) + log.addHandler(stdout_handler) + + warn_msg = "test_warn" + error_msg = "test_error" + log_msg_1 = "test_log_1" + log_msg_2 = "test_log_2" + + if six.PY3: + with mock.patch.object(sys, 'stdout', stream): + logging_utils.warn(warn_msg) + logging_utils.error(error_msg) + self.translator_logger.verbosity_level = 2 + logging_utils.log(1, log_msg_1) + logging_utils.log(2, log_msg_2) + + result_msg = '\n'.join([warn_msg, error_msg, log_msg_2, ""]) + self.assertEqual(result_msg, stream.getvalue()) + + def test_log_transformed_code(self): + source_code = "x = 3" + ast_code = gast.parse(source_code) + + stream = io.BytesIO() if six.PY2 else io.StringIO() + log = self.translator_logger.logger + stdout_handler = logging.StreamHandler(stream) + log.addHandler(stdout_handler) + + if six.PY3: + with mock.patch.object(sys, 'stdout', stream): + paddle.jit.set_code_level(1) + logging_utils.log_transformed_code(1, ast_code, + "BasicApiTransformer") + + paddle.jit.set_code_level() + logging_utils.log_transformed_code( + logging_utils.LOG_AllTransformer, ast_code, + "All Transformers") + + self.assertIn(source_code, stream.getvalue()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index b8aa0379638fadd19b4956a56c1a3e4811558535..1ef3bd1bf150056816283c83fa3ff6af1e589732 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -25,10 +25,11 @@ from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer 
import AdamOptimizer -from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.io import VARIABLE_FILENAME from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator +from predictor_utils import PredictorTools + SEED = 2020 @@ -100,7 +101,7 @@ class MNIST(fluid.dygraph.Layer): loc=0.0, scale=scale)), act="softmax") - @declarative + @paddle.jit.to_static def forward(self, inputs, label=None): x = self.inference(inputs) if label is not None: @@ -132,7 +133,7 @@ class TestMNIST(unittest.TestCase): drop_last=True) -class TestMNISTWithDeclarative(TestMNIST): +class TestMNISTWithToStatic(TestMNIST): """ Tests model if doesn't change the layers while decorated by `dygraph_to_static_output`. In this case, everything should @@ -145,7 +146,7 @@ class TestMNISTWithDeclarative(TestMNIST): def train_dygraph(self): return self.train(to_static=False) - def test_mnist_declarative(self): + def test_mnist_to_static(self): dygraph_loss = self.train_dygraph() static_loss = self.train_static() self.assertTrue( @@ -220,6 +221,10 @@ class TestMNISTWithDeclarative(TestMNIST): dygraph_infer_out = self.jit_load_and_run_inference_dygraph( infer_model_path, inputs) self.assertTrue(np.allclose(gt_out.numpy(), dygraph_infer_out)) + # load in Paddle-Inference + predictor_infer_out = self.predictor_load_and_run_inference_analysis( + infer_model_path, inputs) + self.assertTrue(np.allclose(gt_out.numpy(), predictor_infer_out)) @switch_to_static_graph def jit_load_and_run_inference_static(self, model_path, inputs): @@ -241,6 +246,11 @@ class TestMNISTWithDeclarative(TestMNIST): pred = infer_net(inputs[0]) return pred.numpy() + def predictor_load_and_run_inference_analysis(self, model_path, inputs): + output = PredictorTools(model_path, VARIABLE_FILENAME, inputs) + out = output() + return out + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index ef0f6e7f0831eea8d2f694413c5231ecea292ff4..a377075062b268723aaa3cb17bfa25d6b181798d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -14,6 +14,7 @@ import time import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr @@ -23,6 +24,8 @@ from paddle.fluid.dygraph.io import VARIABLE_FILENAME import unittest +from predictor_utils import PredictorTools + # Note: Set True to eliminate randomness. # 1. For one operation, cuDNN has several algorithms, # some algorithm results are non-deterministic, like convolution algorithms. 
@@ -445,8 +448,8 @@ def train_mobilenet(args, to_static): with fluid.dygraph.guard(args.place): np.random.seed(SEED) - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) if args.model == "MobileNetV1": net = MobileNetV1(class_dim=args.class_dim, scale=1.0) @@ -550,6 +553,12 @@ def predict_dygraph_jit(args, data): return pred_res.numpy() +def predict_analysis_inference(args, data): + output = PredictorTools(args.model_save_path, VARIABLE_FILENAME, [data]) + out = output() + return out + + class TestMobileNet(unittest.TestCase): def setUp(self): self.args = Args() @@ -577,12 +586,18 @@ class TestMobileNet(unittest.TestCase): dy_pre = predict_dygraph(self.args, image) st_pre = predict_static(self.args, image) dy_jit_pre = predict_dygraph_jit(self.args, image) + predictor_pre = predict_analysis_inference(self.args, image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) self.assertTrue( np.allclose(dy_jit_pre, st_pre), msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre)) + self.assertTrue( + np.allclose( + predictor_pre, st_pre, atol=1e-5), + msg="inference_pred_res:\n {}\n, st_pre: \n{}.".format( + predictor_pre, st_pre)) def test_mobile_net(self): # MobileNet-V1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 3da60e955deee9b6d4c74ba5ff1a550ae135afdb..f0fbe54f9dbbf93121655e784601467c13b3a70d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -133,7 +133,7 @@ class TestWithTrainAndEval(unittest.TestCase): x = fluid.dygraph.to_variable(x_data) linear_net(x) - _, partial_layer = program_translator.get_program_cache().last()[-1] + _, partial_layer = linear_net.forward.program_cache.last()[-1] # check default mode is for training self.assertEqual(partial_layer.program, partial_layer._train_program) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 790319936ac015db09e45b9eac799b3bdf0b0250..df2b69297bb4d9167fa3f1a1fe0005a77ededf8a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -19,7 +19,7 @@ import time import unittest import numpy as np - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.base import to_variable @@ -218,8 +218,8 @@ def train(place): batch_num = 200 with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py index 4813930159744fae362aec7563ea5cda82d958c5..1d211197ebd48f1d0fba87ef807c61a3315ed153 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py @@ -16,6 +16,7 @@ import gym import math import itertools import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.dygraph.nn as nn from paddle.fluid.dygraph import to_variable, Layer @@ -64,8 +65,8 @@ def train(args, place, to_static): env.seed(SEED) with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) local_random = np.random.RandomState(SEED) policy = Policy() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 90d210eba1e0fb1eeaf5eb0c8cbc0ff46c35328f..6556b2f03bd5304e290792d07d1d969ab255bfdc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -26,12 +26,15 @@ from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from predictor_utils import PredictorTools + SEED = 2020 IMAGENET1000 = 1281167 base_lr = 0.001 momentum_rate = 0.9 l2_decay = 1e-4 -batch_size = 8 +# NOTE: Reduce batch_size from 8 to 2 to avoid unittest timeout. +batch_size = 2 epoch_num = 1 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ else fluid.CPUPlace() @@ -212,8 +215,8 @@ def train(to_static): """ with fluid.dygraph.guard(place): np.random.seed(SEED) - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) train_reader = paddle.batch( reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), @@ -306,6 +309,12 @@ def predict_dygraph_jit(data): return pred_res.numpy() +def predict_analysis_inference(data): + output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data]) + out = output() + return out + + class TestResnet(unittest.TestCase): def train(self, to_static): program_translator.enable(to_static) @@ -316,12 +325,17 @@ class TestResnet(unittest.TestCase): dy_pre = predict_dygraph(image) st_pre = predict_static(image) dy_jit_pre = predict_dygraph_jit(image) + predictor_pre = predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) self.assertTrue( np.allclose(dy_jit_pre, st_pre), msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre)) + self.assertTrue( + np.allclose(predictor_pre, st_pre), + msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre, + st_pre)) def test_resnet(self): static_loss = self.train(to_static=True) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py index 0386b7c7a17a0f93040fa18d688347f30f27850d..6cf59c030c00384b225d5d13160f68a3558084b9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py @@ -133,7 +133,7 @@ class TestPartialProgramRaiseError(unittest.TestCase): x = fluid.dygraph.to_variable(x_data) out = net(x) - program_cache = program_translator.get_program_cache() + 
program_cache = SimpleFcLayer.forward.program_cache _, (concrete_program, _) = program_cache.last() params = concrete_program.parameters diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index c34e9478c8eab38c429c01db5fae460eeac6a4bd..38e4d5ad5480beb195bcc0c3cc21f033df8fbd5d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -26,6 +26,8 @@ from paddle.fluid.dygraph import declarative from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from predictor_utils import PredictorTools + SEED = 2020 np.random.seed(SEED) @@ -329,8 +331,8 @@ def train(train_reader, to_static): np.random.seed(SEED) with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) se_resnext = SeResNeXt() optimizer = optimizer_setting(train_parameters, se_resnext.parameters()) @@ -434,6 +436,12 @@ def predict_dygraph_jit(data): return pred_res.numpy() +def predict_analysis_inference(data): + output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data]) + out = output() + return out + + class TestSeResnet(unittest.TestCase): def setUp(self): self.train_reader = paddle.batch( @@ -447,12 +455,17 @@ class TestSeResnet(unittest.TestCase): dy_pre = predict_dygraph(image) st_pre = predict_static(image) dy_jit_pre = predict_dygraph_jit(image) + predictor_pre = predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) self.assertTrue( np.allclose(dy_jit_pre, st_pre), msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre)) + self.assertTrue( + np.allclose(predictor_pre, st_pre), + msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre, + st_pre)) def test_check_result(self): pred_1, loss_1, acc1_1, acc5_1 = train( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index fd5a58be26be43996bbb1f80557512bf974de52f..2aa3396fb7f8534374746329af43d4f823e4d5cf 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -15,6 +15,7 @@ import time import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Conv2D, Linear, Embedding from paddle.fluid.dygraph import to_variable, ProgramTranslator, declarative @@ -285,8 +286,8 @@ def train(args, to_static): with fluid.dygraph.guard(place): np.random.seed(SEED) - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) train_reader = fake_data_reader(args.class_num, args.vocab_size, args.batch_size, args.padding_size) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py index 552a6307f33378e7b35f84e048729d22a063c796..14b9ac2e99584b35ec0821949b56b6ee92076571 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py @@ -108,8 +108,8 @@ def train(conf_dict, to_static): place = fluid.CPUPlace() with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) conf_dict['dict_size'] = len(vocab) conf_dict['seq_len'] = args.seq_len diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py index 7aa465949eb704ee5f23b3ad44f3fc57adb04154..4fc8d27d30cb8f67c30bbcd8dcd30938f906462d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py @@ -18,6 +18,7 @@ import time import unittest import numpy as np +import paddle import paddle.fluid as fluid import transformer_util as util @@ -31,10 +32,11 @@ STEP_NUM = 10 def train_static(args, batch_generator): + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) train_prog = fluid.Program() startup_prog = fluid.Program() - train_prog.random_seed = SEED - startup_prog.random_seed = SEED + with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): # define input and reader @@ -128,8 +130,8 @@ def train_static(args, batch_generator): def train_dygraph(args, batch_generator): with fluid.dygraph.guard(place): if SEED is not None: - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # define data loader train_loader = fluid.io.DataLoader.from_generator(capacity=10) train_loader.set_batch_generator(batch_generator, places=place) @@ -220,7 +222,8 @@ def train_dygraph(args, batch_generator): def predict_dygraph(args, batch_generator): with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # define data loader test_loader = fluid.io.DataLoader.from_generator(capacity=10) @@ -291,7 +294,8 @@ def predict_dygraph(args, batch_generator): def predict_static(args, batch_generator): test_prog = fluid.Program() with fluid.program_guard(test_prog): - test_prog.random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # define input and reader input_field_names = util.encoder_data_input_fields + util.fast_decoder_data_input_fields diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index 13a97fb7478db8acaa46db3b7a6c4341997193eb..bedca412157f0b4d125f75ee5eabd0145411451b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -20,7 +20,7 @@ import random import sys import time import unittest - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import declarative, ProgramTranslator, to_variable from paddle.fluid.dygraph.nn import Conv2D, BatchNorm, Linear, Pool2D @@ -272,8 +272,8 @@ def train(args, fake_data_reader, to_static): random.seed(0) np.random.seed(0) with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = 1000 - fluid.default_main_program().random_seed = 1000 + paddle.manual_seed(1000) + 
paddle.framework.random._manual_program_seed(1000) video_model = TSM_ResNet("TSM", train_config, 'Train') diff --git a/python/paddle/fluid/tests/unittests/test_hdfs.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py similarity index 62% rename from python/paddle/fluid/tests/unittests/test_hdfs.py rename to python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 75e2f5d679204c33f922ea8ee6be71a900c83cd6..6a752bc3053d7d0672bd0002250252c3bbbfa1e1 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -24,7 +24,7 @@ from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFil java_home = os.environ["JAVA_HOME"] -class FSTest(unittest.TestCase): +class FSTestBase(unittest.TestCase): def _test_dirs(self, fs): dir_path = os.path.abspath("./test_dir") fs.delete(dir_path) @@ -188,106 +188,6 @@ class FSTest(unittest.TestCase): except Exception as e: pass - def test_exists(self): - fs = HDFSClient( - "/usr/local/hadoop-2.7.7/", - None, - time_out=15 * 1000, - sleep_inter=100) - self.assertFalse(fs.is_exist(os.path.abspath("./xxxx"))) - self.assertFalse(fs.is_dir(os.path.abspath("./xxxx"))) - self.assertTrue(fs.is_dir(os.path.abspath("./xxx/.."))) - dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs.py")) - self.assertTrue(dirs == []) - self.assertTrue(len(files) == 1) - dirs, files = fs.ls_dir(os.path.abspath("./xxx/..")) - - def test_hdfs(self): - fs = HDFSClient( - "/usr/local/hadoop-2.7.7/", - None, - time_out=15 * 1000, - sleep_inter=100) - self._test_rm(fs) - self._test_touch(fs) - self._test_dirs(fs) - self._test_upload(fs) - - self._test_download(fs) - self._test_mkdirs(fs) - self._test_list_dir(fs) - self._test_try_upload(fs) - self._test_try_download(fs) - - def test_local(self): - fs = LocalFS() - self._test_rm(fs) - self._test_touch(fs) - self._test_dirs(fs) - self._test_touch_file(fs) - self._test_mkdirs(fs) - self._test_list_dir(fs) - self._test_try_upload(fs) - self._test_try_download(fs) - - def test_timeout(self): - fs = HDFSClient( - "/usr/local/hadoop-2.7.7/", - None, - time_out=6 * 1000, - sleep_inter=100) - src = "hdfs_test_timeout" - dst = "new_hdfs_test_timeout" - fs.delete(dst) - fs.mkdirs(src) - fs.mkdirs(dst) - fs.mkdirs(dst + "/" + src) - output = "" - try: - fs.mv(src, dst, test_exists=False) - self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd, - output)) - except FSTimeOut as e: - print("execute mv {} to {} timeout".format(src, dst)) - - cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst) - ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000) - self.assertNotEqual(ret, 0) - print("second mv ret:{} output:{}".format(ret, output)) - - def test_is_dir(self): - fs = HDFSClient( - "/usr/local/hadoop-2.7.7/", - None, - time_out=15 * 1000, - sleep_inter=100) - self.assertFalse(fs.is_dir("./test_hdfs.py")) - s = """ -java.io.IOException: Input/output error - responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error - at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164) - at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118) - at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696) - at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297) - at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514) - at 
org.apache.hadoop.fs.FsShell.test(FsShell.java:1092) - at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285) - at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65) - at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79) - at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353) - """ - - print("split lines:", s.splitlines()) - self.assertTrue(fs._test_match(s.splitlines()) != None) - - def test_config(self): - config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"} - fs = HDFSClient( - "/usr/local/hadoop-2.7.7/", - config, - time_out=15 * 1000, - sleep_inter=100) - def _test_list_dir(self, fs): fs = HDFSClient( "/usr/local/hadoop-2.7.7/", diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py index 64fee35710ae1b8690ec41b247ceb55e180b13c9..13041827ffeabd3d6b79e4f34a67bd09624e54f6 100644 --- a/python/paddle/fluid/tests/unittests/launch_function_helper.py +++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py @@ -13,6 +13,9 @@ # limitations under the License. from multiprocessing import Pool, Process import os +import socket +from contextlib import closing +import psutil def launch_func(func, env_dict): @@ -20,3 +23,31 @@ def launch_func(func, env_dict): os.environ[key] = env_dict[key] proc = Process(target=func) return proc + + +def wait(procs, timeout=None): + # wait + decents = [] + for p in procs: + for child in psutil.Process(p.pid).children(recursive=True): + decents.append(child) + + gone, alive = psutil.wait_procs(decents, timeout=timeout) + for p in alive: + p.kill() + for p in gone: + if p.returncode != 0: + sys.exit(1) + + +def _find_free_port(port_set): + def __free_port(): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in port_set: + port_set.add(port) + return port diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index 5e2059592b5170fdd623f4a20b9fa47612ff2a6a..1320623f8f8422f14677a3ca629735838dc94aa8 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -26,7 +26,7 @@ import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.nn import Conv2D, Pool2D, Linear, SyncBatchNorm +from paddle.nn import Conv2d, Pool2D, Linear, SyncBatchNorm from paddle.fluid.dygraph.base import to_variable from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase @@ -42,26 +42,24 @@ class TestLayer(fluid.dygraph.Layer): act=None): super(TestLayer, self).__init__() - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, + self._conv = Conv2d( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, - act=None, bias_attr=False) self._sync_batch_norm = SyncBatchNorm(num_filters) - self._conv2 = Conv2D( - num_channels=num_filters, - num_filters=num_filters, - filter_size=filter_size, + self._conv2 = Conv2d( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, - act=None, bias_attr=False) self._sync_batch_norm2 
= SyncBatchNorm( diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index ef4779f0e6f2df2f0b79f776d1e7b6c5cbf31a22..9c3ed13cbb0002231888433a451c7d5e7188244d 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -17,6 +17,7 @@ from __future__ import print_function import multiprocessing import os import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import compiler @@ -34,7 +35,7 @@ class TestParallelExecutorBase(unittest.TestCase): def check_network_convergence(cls, method, use_cuda=True, - iter=50, + iter=5, batch_size=None, feed_dict=None, feed_data_reader=None, @@ -64,10 +65,11 @@ class TestParallelExecutorBase(unittest.TestCase): feed_data_reader, FeedDataReader ), "feed_data_reader must be type of FeedDataReader" + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main = fluid.Program() startup = fluid.Program() - startup.random_seed = 1 - main.random_seed = 1 + with fluid.program_guard(main, startup): feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder, main, method, optimizer) diff --git a/python/paddle/fluid/tests/unittests/parallel_test.sh b/python/paddle/fluid/tests/unittests/parallel_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..9da4f035345d7f04b69a1c9483cba7022ad10baa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_test.sh @@ -0,0 +1,60 @@ +#!/bin/bash +unset https_proxy http_proxy +export FLAGS_rpc_disable_reuse_port=1 + +name=${TEST_TARGET_NAME} +UnitTests=${UnitTests} +TEST_TIMEOUT=${TEST_TIMEOUT} + +if [[ ${name}"x" == "x" ]]; then + echo "can't find name, please set TEST_TARGET_NAME first" + exit 1 +fi + +if [[ ${UnitTests}"x" == "x" ]]; then + echo "can't find UnitTests, please set TEST_TARGET_NAME first" + exit 1 +fi + +if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then + echo "can't find ${TEST_TIMEOUT}, please set ${TEST_TIMEOUT} first" + exit 1 +fi + +if [[ ${WITH_COVERAGE} == "ON" ]]; then + PYTHON_EXEC="python -u -m coverage run --branch -p " +else + PYTHON_EXEC="python -u " +fi + +run_time=$(( $TEST_TIMEOUT - 10 )) +echo "run_time: ${run_time}" +for ut in ${UnitTests}; do + echo "start ${ut}" + timeout -s SIGKILL ${run_time} ${PYTHON_EXEC} ./${ut}.py > ${ut}_run.log 2>&1 & +done + +FAIL=0 +for job in `jobs -p` +do + echo "jobs -p result:" `jobs -p` + echo $job + wait $job || let FAIL=FAIL+1 +done + +echo "fail_num:" $FAIL + +if [ "$FAIL" == "0" ]; +then + exit 0 +else + echo "FAIL! 
($FAIL)" + + for ut in ${UnitTests}; do + log=${ut}_run.log + echo "cat ${log}" + cat $log + done + + exit 1 +fi diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/dygraph/backward_strategy.py b/python/paddle/fluid/tests/unittests/rnn/__init__.py similarity index 76% rename from python/paddle/fluid/dygraph/backward_strategy.py rename to python/paddle/fluid/tests/unittests/rnn/__init__.py index bfcf66af31ce13b3394b5b091882b1976f9f003a..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/python/paddle/fluid/dygraph/backward_strategy.py +++ b/python/paddle/fluid/tests/unittests/rnn/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,9 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from paddle.fluid import core - -__all__ = ["BackwardStrategy"] - -BackwardStrategy = core.BackwardStrategy diff --git a/python/paddle/fluid/tests/unittests/rnn/convert.py b/python/paddle/fluid/tests/unittests/rnn/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..02f10694a4b47e8a58e2fd0db4453cafedcbbdc1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/convert.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import numpy as np + + +def convert_params_for_cell(np_cell, paddle_cell): + state = np_cell.parameters + for k, v in paddle_cell.named_parameters(): + v.set_value(state[k]) + + +def convert_params_for_cell_static(np_cell, paddle_cell, place): + state = np_cell.parameters + for k, v in paddle_cell.named_parameters(): + scope = paddle.static.global_scope() + tensor = scope.find_var(v.name).get_tensor() + tensor.set(state[k], place) + + +def convert_params_for_net(np_net, paddle_net): + for np_layer, paddle_layer in zip(np_net, paddle_net): + if hasattr(np_layer, "cell"): + convert_params_for_cell(np_layer.cell, paddle_layer.cell) + else: + convert_params_for_cell(np_layer.cell_fw, paddle_layer.cell_fw) + convert_params_for_cell(np_layer.cell_bw, paddle_layer.cell_bw) + + +def convert_params_for_net_static(np_net, paddle_net, place): + for np_layer, paddle_layer in zip(np_net, paddle_net): + if hasattr(np_layer, "cell"): + convert_params_for_cell_static(np_layer.cell, paddle_layer.cell, + place) + else: + convert_params_for_cell_static(np_layer.cell_fw, + paddle_layer.cell_fw, place) + convert_params_for_cell_static(np_layer.cell_bw, + paddle_layer.cell_bw, place) diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..7e0b8374b95cf334b4eced550a79d7c717c07aa7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py @@ -0,0 +1,516 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + + +class LayerMixin(object): + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class LayerListMixin(LayerMixin): + def __init__(self, layers=None): + self._layers = list(layers) if layers else [] + + def append(self, layer): + self._layers.append(layer) + + def __iter__(self): + return iter(self._layers) + + +class SimpleRNNCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + if nonlinearity == 'tanh': + self.nonlinearity = np.tanh + else: + self.nonlinearity = lambda x: np.maximum(x, 0.) 
+ + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, + (hidden_size, )).astype('float64') + self.bias_hh = np.random.uniform(-std, std, + (hidden_size, )).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_h = hx + i2h = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = np.matmul(pre_h, self.weight_hh.T) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self.nonlinearity(i2h + h2h) + return h, h + + +class GRUCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + 3 * hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + 3 * hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, ( + 3 * hidden_size)).astype('float64') + self.bias_hh = np.random.uniform(-std, std, ( + 3 * hidden_size)).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden = hx + x_gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = np.split(x_gates, 3, 1) + h_r, h_z, h_c = np.split(h_gates, 3, 1) + + r = 1.0 / (1.0 + np.exp(-(x_r + h_r))) + z = 1.0 / (1.0 + np.exp(-(x_z + h_z))) + c = np.tanh(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + return h, h + + +class LSTMCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + 4 * hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + 4 * hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, ( + 4 * hidden_size)).astype('float64') + self.bias_hh = np.random.uniform(-std, std, ( + 4 * hidden_size)).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = 
self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + init_h = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + init_c = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + return init_h, init_c + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden, pre_cell = hx + gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + gates = gates + self.bias_ih + gates += np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + gates = gates + self.bias_hh + + chunked_gates = np.split(gates, 4, -1) + + i = 1.0 / (1.0 + np.exp(-chunked_gates[0])) + f = 1.0 / (1.0 + np.exp(-chunked_gates[1])) + o = 1.0 / (1.0 + np.exp(-chunked_gates[3])) + c = f * pre_cell + i * np.tanh(chunked_gates[2]) + h = o * np.tanh(c) + + return h, (h, c) + + +def sequence_mask(lengths, max_len=None): + if max_len is None: + max_len = np.max(lengths) + else: + assert max_len >= np.max(lengths) + return np.arange(max_len) < np.expand_dims(lengths, -1) + + +def update_state(mask, new, old): + if not isinstance(old, (tuple, list)): + return np.where(mask, new, old) + else: + return tuple(map(lambda x, y: np.where(mask, x, y), new, old)) + + +def rnn(cell, + inputs, + initial_states, + sequence_length=None, + time_major=False, + is_reverse=False): + if not time_major: + inputs = np.transpose(inputs, [1, 0, 2]) + if is_reverse: + inputs = np.flip(inputs, 0) + + if sequence_length is None: + mask = None + else: + mask = np.transpose(sequence_mask(sequence_length), [1, 0]) + mask = np.expand_dims(mask, -1) + if is_reverse: + mask = np.flip(mask, 0) + + time_steps = inputs.shape[0] + state = initial_states + outputs = [] + for t in range(time_steps): + x_t = inputs[t] + if mask is not None: + m_t = mask[t] + y, new_state = cell(x_t, state) + y = np.where(m_t, y, 0.) 
+ outputs.append(y) + state = update_state(m_t, new_state, state) + else: + y, new_state = cell(x_t, state) + outputs.append(y) + state = new_state + + outputs = np.stack(outputs) + final_state = state + + if is_reverse: + outputs = np.flip(outputs, 0) + if not time_major: + outputs = np.transpose(outputs, [1, 0, 2]) + return outputs, final_state + + +def birnn(cell_fw, + cell_bw, + inputs, + initial_states, + sequence_length=None, + time_major=False): + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = np.concatenate((outputs_fw, outputs_bw), -1) + final_states = (states_fw, states_bw) + return outputs, final_states + + +def flatten(nested): + return list(_flatten(nested)) + + +def _flatten(nested): + for item in nested: + if isinstance(item, (list, tuple)): + for subitem in _flatten(item): + yield subitem + else: + yield item + + +def unstack(array, axis=0): + num = array.shape[axis] + sub_arrays = np.split(array, num, axis) + return [np.squeeze(sub_array, axis) for sub_array in sub_arrays] + + +def dropout(array, p=0.5): + if p == 0.0: + return array + + mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype) + return array * (mask / (1 - p)) + + +def split_states(states, bidirectional=False, state_components=1): + if state_components == 1: + states = unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + assert len(states) == state_components + states = tuple([unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + if state_components == 1: + return np.stack(flatten(states)) + else: + states = flatten(states) + componnets = [] + for i in range(state_components): + componnets.append(states[i::state_components]) + return [np.stack(item) for item in componnets] + + +class RNN(LayerMixin): + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + + def forward(self, inputs, initial_states=None, sequence_length=None): + final_outputs, final_states = rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) + return final_outputs, final_states + + +class BiRNN(LayerMixin): + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + self.time_major = time_major + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + + outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major) + return outputs, final_states + + +class RNNMixin(LayerListMixin): + def forward(self, inputs, 
initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] + dtype = inputs.dtype + if initial_states is None: + state_shape = (self.num_layers * self.num_directions, batch_size, + self.hidden_size) + if self.state_components == 1: + initial_states = np.zeros(state_shape, dtype) + else: + initial_states = tuple([ + np.zeros(state_shape, dtype) + for _ in range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) + final_states = [] + + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = dropout(inputs, self.dropout) + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) + return outputs, final_states + + +class SimpleRNN(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + nonlinearity="tanh", + direction="forward", + dropout=0., + time_major=False): + super(SimpleRNN, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 + + +class LSTM(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(LSTM, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size) + cell_bw = LSTMCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size) + cell_bw = LSTMCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 2 + + +class 
GRU(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(GRU, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size) + cell_bw = GRUCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size) + cell_bw = GRUCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py new file mode 100644 index 0000000000000000000000000000000000000000..8d2677229a03f7bdac14a93e176747ba0a5f1d6b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -0,0 +1,166 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
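Editor's note (illustrative, not part of the patch): the convert.py and rnn_numpy.py modules added above form the numpy reference implementation that the new unit tests below check paddle.nn's RNN layers against. A minimal sketch of how that reference behaves on its own, assuming rnn_numpy.py from this patch is importable (for example, run from the rnn test directory):

```python
# Minimal sketch: exercise the numpy reference LSTM added in rnn_numpy.py above.
import numpy as np

from rnn_numpy import LSTM  # assumes rnn_numpy.py is on sys.path

# Two stacked bidirectional layers; inputs are batch-major (time_major=False).
net = LSTM(input_size=16, hidden_size=32, num_layers=2, direction="bidirectional")

x = np.random.randn(4, 12, 16)                      # batch=4, 12 time steps
lengths = np.array([12, 10, 9, 8], dtype=np.int64)  # valid steps per sample

outputs, (h, c) = net(x, sequence_length=lengths)

print(outputs.shape)     # (4, 12, 64): forward/backward outputs concatenated
print(h.shape, c.shape)  # (4, 4, 32) each: num_layers * num_directions states
```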
+ +import paddle +paddle.framework.set_default_dtype("float64") + +import numpy as np +import unittest + +from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell +from convert import convert_params_for_cell + + +class TestSimpleRNNCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestSimpleRNNCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = SimpleRNNCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.SimpleRNNCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestGRUCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestGRUCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = GRUCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.GRUCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTMCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestLSTMCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTMCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.LSTMCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + prev_c = np.random.randn(4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + y2, (h2, c2) = rnn2( + paddle.to_variable(x), + (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, (h1, c1) = 
rnn1(x) + y2, (h2, c2) = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] + for bias in [True, False]: + for device in devices: + for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: + suite.addTest(test_class(bias, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py new file mode 100644 index 0000000000000000000000000000000000000000..948e47d5b99462c363015936f84058e222d548e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -0,0 +1,326 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +paddle.framework.set_default_dtype("float64") + +import numpy as np +import unittest + +from convert import convert_params_for_cell_static +from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell + + +class TestSimpleRNNCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestSimpleRNNCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = SimpleRNNCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.SimpleRNNCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = 
self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestGRUCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestGRUCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = GRUCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.GRUCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTMCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestLSTMCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTMCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.LSTMCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = 
paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + prev_c = np.random.randn(4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + init_c = paddle.data( + "init_c", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data, (init_h, init_c)) + + feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c} + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + + y1, (h1, c1) = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h, c], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] + for bias in [True, False]: + for device in devices: + for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: + suite.addTest(test_class(bias, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py new file mode 100644 index 0000000000000000000000000000000000000000..ef297b3bb62497073fd667238cae8a83daaa4967 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
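Editor's note (illustrative, not part of the patch): both cell test files above follow the same recipe: build a numpy cell from rnn_numpy.py, build the matching paddle.nn cell, copy the numpy weights across with the helpers from convert.py, then assert that the outputs agree. A numpy-only sketch of the reference side of that comparison, assuming rnn_numpy.py is importable:

```python
# Sketch: the numpy half of the cell comparison performed in the tests above.
import numpy as np

from rnn_numpy import GRUCell  # reference cell added in rnn_numpy.py above

cell = GRUCell(input_size=16, hidden_size=32)

x = np.random.randn(4, 16)   # batch of 4 input feature vectors
h0 = np.random.randn(4, 32)  # previous hidden state

y, h1 = cell(x, h0)          # LayerMixin.__call__ dispatches to forward()
assert y.shape == (4, 32) and np.allclose(y, h1)

# .parameters holds the raw weights; convert_params_for_cell (dygraph) and
# convert_params_for_cell_static (static graph) copy these arrays into the
# matching paddle.nn cell before the two implementations are compared.
print(sorted(cell.parameters))  # ['bias_hh', 'bias_ih', 'weight_hh', 'weight_ih']
```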
+ +import paddle +paddle.set_default_dtype("float64") +from paddle.fluid.layers import sequence_mask + +import numpy as np +import unittest + +from convert import convert_params_for_net +from rnn_numpy import SimpleRNN, LSTM, GRU + + +class TestSimpleRNN(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestSimpleRNN, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + rnn2 = paddle.nn.SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestGRU(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestGRU, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + rnn2 = paddle.nn.GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), 
paddle.to_variable(prev_h)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestLSTM(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestLSTM, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + rnn2 = paddle.nn.LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + prev_c = np.random.randn(2 * self.num_directions, 4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + y2, (h2, c2) = rnn2( + paddle.to_variable(x), + (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, (h1, c1) = rnn1(x) + y2, (h2, c2) = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = 
paddle.transpose(mask, [1, 0]) + y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] + for direction in ["forward", "backward", "bidirectional"]: + for time_major in [True, False]: + for device in devices: + for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: + suite.addTest(test_class(time_major, direction, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py new file mode 100644 index 0000000000000000000000000000000000000000..90ed6b8b4c9075f5a3e3925bb80e24c81a37869c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py @@ -0,0 +1,470 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
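Editor's note (illustrative, not part of the patch): in test_with_input_lengths above, the paddle output is multiplied by a sequence mask so that time steps beyond each sample's length drop out of the comparison, matching the numpy reference, which zeroes those steps itself. A pure-numpy sketch of that masking, using the sequence_mask helper from rnn_numpy.py:

```python
# Sketch of the length masking used by test_with_input_lengths above.
import numpy as np

from rnn_numpy import sequence_mask  # helper added in rnn_numpy.py above

lengths = np.array([12, 10, 9, 8], dtype=np.int64)
mask = sequence_mask(lengths, max_len=12)  # boolean, shape (4, 12)

outputs = np.random.randn(4, 12, 32)       # batch-major RNN outputs
masked = outputs * mask[:, :, None]        # zero out padded time steps

# Sample 1 has length 10 and sample 3 has length 8, so their tails are zero.
assert np.all(masked[1, 10:] == 0.0)
assert np.all(masked[3, 8:] == 0.0)
```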
+ +import paddle +paddle.set_default_dtype("float64") +from paddle.fluid.layers import sequence_mask + +import numpy as np +import unittest + +from convert import convert_params_for_net_static +from rnn_numpy import SimpleRNN, LSTM, GRU + + +class TestSimpleRNN(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestSimpleRNN, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.SimpleRNN( + 16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone().clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = 
paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, h = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestGRU(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestGRU, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = 
np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, h = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTM(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestLSTM, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.LSTM( + 16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + prev_c = np.random.randn(2 * self.num_directions, 4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + init_c = paddle.data( + "init_c", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data, (init_h, init_c)) + + feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c} + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 
2]) + + y1, (h1, c1) = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, (h, c) = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] + for direction in ["forward", "backward", "bidirectional"]: + for time_major in [True, False]: + for device in devices: + for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: + suite.addTest(test_class(time_major, direction, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py index 17e0cd0d5b18652f828af9936b07cb4122f87b97..45d39afc115d292fd79a3bbc4f609ad080f74602 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_net.py +++ b/python/paddle/fluid/tests/unittests/seresnext_net.py @@ -36,7 +36,7 @@ remove_dropout = False # and Executor is different. remove_bn = False -remove_cudnn_conv = False +remove_cudnn_conv = True remove_dropout = True remove_bn = True @@ -179,7 +179,7 @@ def batch_size(use_cuda): def iter(use_cuda): if use_cuda: return 10 - return 2 + return 1 gpu_img, gpu_label = init_data( diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py new file mode 100644 index 0000000000000000000000000000000000000000..278d7b27c528803211e21ae7b1f1190e3053bcc4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division + +import numpy as np +import unittest + +import paddle + +# used by model.run_trainer in test_dist_base +from test_dist_base import RUN_STEP + + +# NOTE: compatible TestParallelDyGraphRunnerBase args +class SpawnAssistTestArgs(object): + update_method = "local" + trainer_id = 0 + + +class TestDistSpawnRunner(unittest.TestCase): + def setUp(self): + # NOTE(chenweihang): keep consistent with + # TestDistBase.check_with_place + self.nprocs = 2 + + def _run(self, model, args): + args.update_method = "local" + return model.run_trainer_with_spawn(args) + + def _run_parallel(self, model, args): + args.update_method = "nccl2" + context = paddle.distributed.spawn( + func=model.run_trainer_with_spawn, + args=(args, ), + nprocs=self.nprocs, + join=True) + result_list = [] + for res_queue in context.return_queues: + result_list.append(res_queue.get()) + return result_list + + def check_dist_result_with_spawn(self, test_class, delta=1e-3): + # 0. prepare model and args + model = test_class() + args = SpawnAssistTestArgs() + + # 1. calc signal card loss + losses = self._run(model, args) + + # 2. calc multi card loss (nccl mode) + dist_losses_list = self._run_parallel(model, args) + + # 3. compare losses + for step_id in range(RUN_STEP): + loss = losses[step_id] + dist_loss_sum = None + for dist_losses in dist_losses_list: + if dist_loss_sum is None: + dist_loss_sum = np.array(dist_losses[step_id]) + else: + dist_loss_sum += np.array(dist_losses[step_id]) + dist_loss = dist_loss_sum / self.nprocs + self.assertAlmostEqual( + loss, + dist_loss, + delta=delta, + msg="The results of single-card execution and multi-card execution are inconsistent." + "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". 
+ format(loss, dist_loss)) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py old mode 100644 new mode 100755 index aef49df577fe5489c2ee096cc54ce070a72da312..ab61a5b3cfccb0e885debe9786ae91a9754e9345 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -191,6 +191,59 @@ class TestTanh(TestActivation, TestParameter): self.dtype = np.float32 +class TestTanhAPI(unittest.TestCase): + # test paddle.tanh, paddle.nn.tanh, paddle.nn.functional.tanh + def setUp(self): + self.dtype = 'float32' + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [10, 12], self.dtype) + out1 = F.tanh(x) + th = paddle.nn.Tanh() + out2 = th(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = np.tanh(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_variable(self.x_np) + out1 = F.tanh(x) + out2 = paddle.tanh(x) + th = paddle.nn.Tanh() + out3 = th(x) + out_ref = np.tanh(self.x_np) + for r in [out1, out2, out3]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', [10, 12], self.dtype) + out = fluid.layers.tanh(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = np.tanh(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.tanh, 1) + # The input dtype must be float16, float32. 
+ x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.tanh, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.tanh(x_fp16) + + class TestAtan(TestActivation, TestParameter): def setUp(self): self.op_type = "atan" @@ -369,15 +422,20 @@ class TestCoshOpError(unittest.TestCase): fluid.layers.cosh(x_fp16) -class TestTanhShrink(TestActivation): +def ref_tanhshrink(x): + out = x - np.tanh(x) + return out + + +class TestTanhshrink(TestActivation): def setUp(self): self.op_type = "tanh_shrink" self.init_dtype() - x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype) - out = x - np.tanh(x) + x = np.random.uniform(10, 20, [10, 17]).astype(self.dtype) + out = ref_tanhshrink(x) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.inputs = {'X': x} self.outputs = {'Out': out} def test_check_grad(self): @@ -386,6 +444,57 @@ class TestTanhShrink(TestActivation): self.check_grad(['X'], 'Out') +class TestTanhshrinkAPI(unittest.TestCase): + # test paddle.nn.Tanhshrink, paddle.nn.functional.tanhshrink + def setUp(self): + self.x_np = np.random.uniform(10, 20, [10, 17]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.tanhshrink(x) + tanhshrink = paddle.nn.Tanhshrink() + out2 = tanhshrink(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_tanhshrink(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.tanhshrink(x) + tanhshrink = paddle.nn.Tanhshrink() + out2 = tanhshrink(x) + out_ref = ref_tanhshrink(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.tanh_shrink(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_tanhshrink(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.tanhshrink, 1) + # The input dtype must be float16, float32, float64. 
+ x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.tanhshrink, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.tanhshrink(x_fp16) + + def ref_hardshrink(x, threshold): out = np.copy(x) out[(out >= -threshold) & (out <= threshold)] = 0 @@ -397,20 +506,29 @@ class TestHardShrink(TestActivation): self.op_type = "hard_shrink" self.init_dtype() - threshold = 0.5 + self.threshold = 0.5 + self.set_attrs() x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) * 10 - out = ref_hardshrink(x, threshold) + out = ref_hardshrink(x, self.threshold) - self.attrs = {'threshold': threshold} + self.attrs = {'threshold': self.threshold} self.inputs = {'X': x} self.outputs = {'Out': out} + def set_attrs(self): + pass + def test_check_grad(self): if self.dtype == np.float16: return self.check_grad(['X'], 'Out') +class TestHardShrink_threshold_negative(TestHardShrink): + def set_attrs(self): + self.threshold = -0.1 + + class TestHardShrinkAPI(unittest.TestCase): # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink def setUp(self): @@ -469,19 +587,81 @@ class TestHardShrinkAPI(unittest.TestCase): F.hardshrink(x_fp16) -class TestSoftShrink(TestActivation): +def ref_hardtanh(x, min=-1.0, max=1.0): + out = np.copy(x) + out[np.abs(x - min) < 0.005] = min + 0.02 + out[np.abs(x - max) < 0.005] = max + 0.02 + out = np.minimum(np.maximum(x, min), max) + return out + + +class TestHardtanhAPI(unittest.TestCase): + # test paddle.nn.Hardtanh, paddle.nn.functional.hardtanh + def setUp(self): + self.x_np = np.random.uniform(-3, 3, [10, 12]).astype('float32') + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [10, 12]) + out1 = F.hardtanh(x) + m = paddle.nn.Hardtanh() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_hardtanh(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_variable(self.x_np) + out1 = F.hardtanh(x) + m = paddle.nn.Hardtanh() + out2 = m(x) + out_ref = ref_hardtanh(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + + out1 = F.hardtanh(x, -2.0, 2.0) + m = paddle.nn.Hardtanh(-2.0, 2.0) + out2 = m(x) + out_ref = ref_hardtanh(self.x_np, -2.0, 2.0) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.hardtanh, 1) + # The input dtype must be float16, float32, float64. 
+ x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.hardtanh, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.hardtanh(x_fp16) + + +def ref_softshrink(x, threshold=0.5): + out = np.copy(x) + out = (out < -threshold) * (out + threshold) + (out > threshold) * ( + out - threshold) + return out + + +class TestSoftshrink(TestActivation): def setUp(self): self.op_type = "softshrink" self.init_dtype() - lambda_val = 0.1 - x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype) - out = np.copy(x) - out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * ( - out - lambda_val) + threshold = 0.8 - self.attrs = {'lambda': lambda_val} - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype) + out = ref_softshrink(x, threshold) + self.inputs = {'X': x} + self.attrs = {"lambda": threshold} self.outputs = {'Out': out} def test_check_grad(self): @@ -490,17 +670,59 @@ class TestSoftShrink(TestActivation): self.check_grad(['X'], 'Out') -class TestSoftShrinkOpError(unittest.TestCase): +class TestSoftshrinkAPI(unittest.TestCase): + # test paddle.nn.Softshrink, paddle.nn.functional.softshrink + def setUp(self): + self.threshold = 0.8 + self.x_np = np.random.uniform(0.25, 10, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.softshrink(x, self.threshold) + softshrink = paddle.nn.Softshrink(self.threshold) + out2 = softshrink(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_softshrink(self.x_np, self.threshold) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.softshrink(x, self.threshold) + softshrink = paddle.nn.Softshrink(self.threshold) + out2 = softshrink(x) + out_ref = ref_softshrink(self.x_np, self.threshold) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.softshrink(x, self.threshold) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_softshrink(self.x_np, self.threshold) + self.assertEqual(np.allclose(out_ref, res[0]), True) + def test_errors(self): - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.softshrink, 1) + self.assertRaises(TypeError, F.softshrink, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.softshrink, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.softshrink, x_int32) + # The threshold must be no less than zero + x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32') + self.assertRaises(ValueError, F.softshrink, x_fp32, -1.0) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.softshrink(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.softshrink(x_fp16) class TestSqrt(TestActivation, TestParameter): @@ -737,18 +959,30 @@ class TestReluAPI(unittest.TestCase): F.relu(x_fp16) +def ref_leaky_relu(x, alpha=0.01): + out = np.copy(x) + out[out < 0] *= alpha + return out + + class TestLeakyRelu(TestActivation): + def get_alpha(self): + return 0.02 + def setUp(self): self.op_type = "leaky_relu" self.init_dtype() + alpha = self.get_alpha() + np.random.seed(10) x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) # The same reason with TestAbs - x[np.abs(x) < 0.005] = 0.02 - out = np.maximum(x, 0.02 * x) + x[np.abs(x) < 0.005] = 0.05 + out = ref_leaky_relu(x, alpha) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.inputs = {'X': x} self.outputs = {'Out': out} + self.attrs = {'alpha': alpha} def test_check_grad(self): if self.dtype == np.float16: @@ -756,18 +990,78 @@ class TestLeakyRelu(TestActivation): self.check_grad(['X'], 'Out') -class TestLeakyReluOpError(unittest.TestCase): +class TestLeakyReluAlpha1(TestLeakyRelu): + def get_alpha(self): + return 2 + + +class TestLeakyReluAlpha2(TestLeakyRelu): + def get_alpha(self): + return -0.01 + + +class TestLeakyReluAlpha3(TestLeakyRelu): + def get_alpha(self): + return -2.0 + + +class TestLeakyReluAPI(unittest.TestCase): + # test paddle.nn.LeakyReLU, paddle.nn.functional.leaky_relu, + # fluid.layers.leaky_relu + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32') + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [10, 12]) + out1 = F.leaky_relu(x) + m = paddle.nn.LeakyReLU() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_leaky_relu(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_variable(self.x_np) + out1 = F.leaky_relu(x) + m = paddle.nn.LeakyReLU() + out2 = m(x) + out_ref = ref_leaky_relu(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + + out1 = F.leaky_relu(x, 0.6) + m = paddle.nn.LeakyReLU(0.6) + out2 = m(x) + out_ref = ref_leaky_relu(self.x_np, 0.6) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', [10, 12]) + out = fluid.layers.leaky_relu(x, 0.01) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_leaky_relu(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + def test_errors(self): - with program_guard(Program()): + with 
paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.leaky_relu, 1) + self.assertRaises(TypeError, F.leaky_relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.leaky_relu, x_int32) - # support the input dtype is float32 - x_fp16 = fluid.layers.data( - name='x_fp16', shape=[12, 10], dtype='float32') - fluid.layers.leaky_relu(x_fp16) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.leaky_relu, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.leaky_relu(x_fp16) def gelu(x, approximate): @@ -903,20 +1197,24 @@ class TestBReluOpError(unittest.TestCase): fluid.layers.brelu(x_fp16) +def ref_relu6(x, threshold=6.0): + out = np.copy(x) + out[np.abs(x - threshold) < 0.005] = threshold + 0.02 + out = np.minimum(np.maximum(x, 0), threshold) + return out + + class TestRelu6(TestActivation): def setUp(self): self.op_type = "relu6" self.init_dtype() x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype) - threshold = 6.0 - # The same with TestAbs x[np.abs(x) < 0.005] = 0.02 - x[np.abs(x - threshold) < 0.005] = threshold + 0.02 - out = np.minimum(np.maximum(x, 0), threshold) + out = ref_relu6(x) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {'threshold': threshold} + self.inputs = {'X': x} + self.attrs = {'threshold': 6.0} self.outputs = {'Out': out} def test_check_grad(self): @@ -925,17 +1223,56 @@ class TestRelu6(TestActivation): self.check_grad(['X'], 'Out') -class TestRelu6OpError(unittest.TestCase): +class TestRelu6API(unittest.TestCase): + # test paddle.nn.ReLU6, paddle.nn.functional.relu6 + def setUp(self): + self.x_np = np.random.uniform(-1, 10, [10, 12]).astype(np.float64) + self.x_np[np.abs(self.x_np) < 0.005] = 0.02 + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.relu6(x) + relu6 = paddle.nn.ReLU6() + out2 = relu6(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_relu6(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.relu6(x) + relu6 = paddle.nn.ReLU6() + out2 = relu6(x) + out_ref = ref_relu6(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.relu6(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_relu6(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + def test_errors(self): - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.relu6, 1) + self.assertRaises(TypeError, F.relu6, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.relu6, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.relu6, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.relu6(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.relu6(x_fp16) class TestHardSwish(TestActivation): @@ -1318,16 +1655,25 @@ class TestSTanhOpError(unittest.TestCase): fluid.layers.stanh(x_fp16) +def ref_softplus(x, beta=1, threshold=20): + x_beta = beta * x + out = np.select([x_beta <= threshold, x_beta > threshold], + [np.log(1 + np.exp(x_beta)) / beta, x]) + return out + + class TestSoftplus(TestActivation): def setUp(self): self.op_type = "softplus" self.init_dtype() - self.dtype = np.float64 - x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) - out = np.log(1 + np.exp(x)) + beta = 2 + threshold = 15 - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_softplus(x, beta, threshold) + self.inputs = {'X': x} + self.attrs = {'beta': beta, "threshold": threshold} self.outputs = {'Out': out} def test_check_grad(self): @@ -1336,15 +1682,72 @@ class TestSoftplus(TestActivation): self.check_grad(['X'], 'Out') +class TestSoftplusAPI(unittest.TestCase): + # test paddle.nn.Softplus, paddle.nn.functional.softplus + def setUp(self): + self.beta = 2 + self.threshold = 15 + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.softplus(x, self.beta, self.threshold) + softplus = paddle.nn.Softplus(self.beta, self.threshold) + out2 = softplus(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_softplus(self.x_np, self.beta, self.threshold) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.softplus(x, self.beta, self.threshold) + softplus = paddle.nn.Softplus(self.beta, self.threshold) + out2 = softplus(x) + out_ref = ref_softplus(self.x_np, self.beta, self.threshold) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.softplus(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_softplus(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.softplus, 1) + # The input dtype must be float16, float32, float64. 
+ x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.softplus, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.softplus(x_fp16) + + +def ref_softsign(x): + out = np.divide(x, 1 + np.abs(x)) + return out + + class TestSoftsign(TestActivation): def setUp(self): self.op_type = "softsign" self.init_dtype() - x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) - out = np.divide(x, 1 + np.abs(x)) - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_softsign(x) + self.inputs = {'X': x} self.outputs = {'Out': out} def test_check_grad(self): @@ -1353,6 +1756,57 @@ class TestSoftsign(TestActivation): self.check_grad(['X'], 'Out') +class TestSoftsignAPI(unittest.TestCase): + # test paddle.nn.Softsign, paddle.nn.functional.softsign + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.softsign(x) + softsign = paddle.nn.Softsign() + out2 = softsign(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_softsign(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.softsign(x) + softsign = paddle.nn.Softsign() + out2 = softsign(x) + out_ref = ref_softsign(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.softsign(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_softsign(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.softsign, 1) + # The input dtype must be float16, float32, float64. 
+ x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.softsign, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.softsign(x_fp16) + + class TestThresholdedRelu(TestActivation): def setUp(self): self.op_type = "thresholded_relu" @@ -1548,9 +2002,9 @@ create_test_act_fp16_class(TestActivation) create_test_act_fp16_class(TestSigmoid) create_test_act_fp16_class(TestLogSigmoid) create_test_act_fp16_class(TestTanh) -create_test_act_fp16_class(TestTanhShrink) +create_test_act_fp16_class(TestTanhshrink) create_test_act_fp16_class(TestHardShrink) -create_test_act_fp16_class(TestSoftShrink) +create_test_act_fp16_class(TestSoftshrink) create_test_act_fp16_class(TestSqrt) create_test_act_fp16_class(TestAbs) create_test_act_fp16_class(TestCeil, grad_check=False) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 7a7099b7113c8233fb94074519386f9e4270a019..d4aafcd27a5aceb3c0b5fa9ddf8343d404bddbf5 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -20,6 +20,7 @@ from op_test import OpTest from paddle.fluid import core from paddle.fluid.op import Operator import paddle.fluid as fluid +import paddle class TestAdamOp1(OpTest): @@ -401,46 +402,107 @@ class TestAdamOpBetaVariable(OpTest): self.check_output() -class TestAdamOptimizerBetaVariable(unittest.TestCase): - def test_adam_optimizer(self): - def test_with_place(place, shape): - exe = fluid.Executor(place) - - train_prog = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(train_prog, startup): - with fluid.unique_name.guard(): - data = fluid.data(name="data", shape=shape) - conv = fluid.layers.conv2d(data, 8, 3) - loss = fluid.layers.reduce_mean(conv) - - beta1 = fluid.layers.create_global_var( - shape=[1], - value=0.85, - dtype='float32', - persistable=True) - beta2 = fluid.layers.create_global_var( - shape=[1], - value=0.95, - dtype='float32', - persistable=True) - opt = fluid.optimizer.Adam( - learning_rate=1e-5, beta1=beta1, beta2=beta2) - opt.minimize(loss) - - exe.run(startup) - data_np = np.random.random(shape).astype('float32') - rets = exe.run(train_prog, - feed={"data": data_np}, - fetch_list=[loss]) - assert rets[0] is not None - +class TestAdamOpV2(unittest.TestCase): + def test_adam_op(self): + place = fluid.CPUPlace() shape = [2, 3, 8, 8] - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for place in places: - test_with_place(place, shape) + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = fluid.layers.reduce_mean(conv) + + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True) + betas = [beta1, beta2] + opt = paddle.optimizer.Adam( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + def 
test_adam_op_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adam_op_with_state_dict(self): + + import paddle + paddle.disable_static() + emb = paddle.nn.Embedding([10, 10]) + + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + + #learning_rate is _LRScheduler + learning_rate = paddle.optimizer.CosineAnnealingLR( + learning_rate=0.1, T_max=10) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, + weight_decay=fluid.regularizer.L2Decay(0.001), + parameters=emb.parameters()) + lr = adam.get_lr() + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + + #leanrning_rate is Tensor + with self.assertRaises(TypeError): + learning_rate = np.array([0.01]).astype("float32") + learning_rate = paddle.to_tensor(learning_rate) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, parameters=emb.parameters()) + + params = adam.get_opti_var_name_list() + assert (params is not None) + + def test_adam_with_grad_clip(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + adam = paddle.optimizer.Adam( + 0.1, parameters=linear.parameters(), grad_clip=clip) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adam_op_with_set_lr(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + lr = 0.01 + adam.set_lr(lr) + cur_lr = adam.get_lr() + assert (lr == cur_lr) + with self.assertRaises(TypeError): + lr_var = paddle.create_global_var( + shape=[1], value=lr, dtype='float32') + adam.set_lr(lr_var) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5a33e11d2862c037639b1643a2e44ff81a757053 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid + + +class TestAdamaxAPI(unittest.TestCase): + def test_adamax_api_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adamax( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adamax_api(self): + place = fluid.CPUPlace() + shape = [2, 3, 8, 8] + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = paddle.mean(conv) + beta1 = 0.85 + beta2 = 0.95 + opt = paddle.optimizer.Adamax( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0a7cf54e2e0f15e51ba1b6f7526837f53c7cc2e0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
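The AdamW tests that follow pass weight_decay=0.01 to paddle.optimizer.AdamW; the property being exercised is that AdamW applies the decay directly to the parameter (decoupled weight decay) rather than folding it into the gradient as L2 regularization. A minimal NumPy sketch of one decoupled update step, with illustrative hyperparameter values and helper names that are not part of the test or of Paddle's kernel:

import numpy as np

def adamw_step(p, g, m, v, t, lr=0.01, wd=0.01,
               beta1=0.9, beta2=0.999, eps=1e-8):
    # standard Adam moment estimates with bias correction
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    # decoupled weight decay: wd * p is applied to the parameter itself,
    # separately from the adaptive gradient step
    p = p - lr * (m_hat / (np.sqrt(v_hat) + eps) + wd * p)
    return p, m, v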
+ +import unittest +import paddle +import numpy as np +import paddle.fluid as fluid + + +class TestAdamWOp(unittest.TestCase): + def test_adamw_op_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adamw_op_coverage(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.0, + parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + assert (adam.__str__() is not None) + + def test_adamw_op(self): + place = fluid.CPUPlace() + shape = [2, 3, 8, 8] + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = paddle.mean(conv) + + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True) + betas = [beta1, beta2] + opt = paddle.optimizer.AdamW( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py new file mode 100644 index 0000000000000000000000000000000000000000..55c30e3d2ade0725e6debcdd0a69ca4eee622aec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py @@ -0,0 +1,274 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
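The adaptive average pooling reference implementation in the file below computes each output cell from an input window whose start index is floor(i * input_size / output_size) and whose end index is ceil((i + 1) * input_size / output_size), so adjacent windows may overlap and together always cover the whole input. A small sketch of that index math (the helper name is illustrative, not part of the test):

import numpy as np

def adaptive_windows(input_size, output_size):
    # same start/end formulas as adaptive_start_index / adaptive_end_index below
    starts = [int(np.floor(i * input_size / output_size))
              for i in range(output_size)]
    ends = [int(np.ceil((i + 1) * input_size / output_size))
            for i in range(output_size)]
    return list(zip(starts, ends))

# For the 7x7 inputs and output_size=[3, 3] used in these tests, each output
# row averages one of these (possibly overlapping) slices of the 7 input rows:
print(adaptive_windows(7, 3))   # [(0, 3), (2, 5), (4, 7)]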
+ +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def adaptive_pool2d_forward(x, output_size, data_format='NCHW', + pool_type="avg"): + + N = x.shape[0] + C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \ + else [x.shape[3], x.shape[1], x.shape[2]] + + if (isinstance(output_size, int) or output_size == None): + H_out = output_size + W_out = output_size + output_size = [H_out, W_out] + else: + H_out, W_out = output_size + + if output_size[0] == None: + output_size[0] = H + H_out = H + if output_size[1] == None: + output_size[1] = W + W_out = W + + out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \ + else np.zeros((N, H_out, W_out, C)) + + for i in range(H_out): + in_h_start = adaptive_start_index(i, H, output_size[0]) + in_h_end = adaptive_end_index(i, H, output_size[0]) + + for j in range(W_out): + in_w_start = adaptive_start_index(j, W, output_size[1]) + in_w_end = adaptive_end_index(j, W, output_size[1]) + + if data_format == 'NCHW': + x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end] + if pool_type == 'avg': + field_size = ( + (in_h_end - in_h_start) * (in_w_end - in_w_start)) + out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size + elif pool_type == 'max': + out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) + elif data_format == 'NHWC': + x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :] + if pool_type == 'avg': + field_size = ( + (in_h_end - in_h_start) * (in_w_end - in_w_start)) + out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size + elif pool_type == 'max': + out[:, i, j, :] = np.max(x_masked, axis=(1, 2)) + return out + + +class TestAdaptiveAvgPool2dAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[3, 3], pool_type="avg") + + self.res_2_np = adaptive_pool2d_forward( + x=self.x_np, output_size=5, pool_type="avg") + + self.res_3_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[2, 5], pool_type="avg") + + self.res_4_np = adaptive_pool2d_forward( + x=self.x_np, + output_size=[3, 3], + pool_type="avg", + data_format="NHWC") + + self.res_5_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[None, 3], pool_type="avg") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + + out_1 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[3, 3]) + + out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[2, 5]) + + out_4 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[3, 3], data_format="NHWC") + + out_5 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[None, 3]) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_4, res_5] = exe.run( + fluid.default_main_program(), 
+ feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_4, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + out_1 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[3, 3]) + + out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[2, 5]) + + out_4 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[3, 3], data_format="NHWC") + + out_5 = paddle.nn.functional.adaptive_avg_pool2d( + x=x, output_size=[None, 3]) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[3, 3], pool_type="avg") + + self.res_2_np = adaptive_pool2d_forward( + x=self.x_np, output_size=5, pool_type="avg") + + self.res_3_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[2, 5], pool_type="avg") + + self.res_4_np = adaptive_pool2d_forward( + x=self.x_np, + output_size=[3, 3], + pool_type="avg", + data_format="NHWC") + + self.res_5_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[None, 3], pool_type="avg") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3]) + out_1 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=5) + out_2 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5]) + out_3 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d( + output_size=[3, 3], data_format="NHWC") + out_4 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d( + output_size=[None, 3]) + out_5 = adaptive_avg_pool(x=x) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_4, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_4, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3]) + out_1 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = 
paddle.nn.AdaptiveAvgPool2d(output_size=5) + out_2 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5]) + out_3 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d( + output_size=[3, 3], data_format="NHWC") + out_4 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d( + output_size=[None, 3]) + out_5 = adaptive_avg_pool(x=x) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py new file mode 100755 index 0000000000000000000000000000000000000000..c04ee660667edaff01d7029e83b912c05429a15f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py @@ -0,0 +1,293 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def adaptive_pool3d_forward(x, + output_size, + adaptive=True, + data_format='NCDHW', + pool_type='avg'): + + N = x.shape[0] + C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \ + if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2],x.shape[3]] + + if (isinstance(output_size, int) or output_size == None): + H_out = output_size + W_out = output_size + D_out = output_size + output_size = [D_out, H_out, W_out] + else: + D_out, H_out, W_out = output_size + + if output_size[0] == None: + output_size[0] = D + D_out = D + if output_size[1] == None: + output_size[1] = H + H_out = H + if output_size[2] == None: + output_size[2] = W + W_out = W + + out = np.zeros((N, C, D_out, H_out, W_out)) if data_format=='NCDHW' \ + else np.zeros((N, D_out, H_out, W_out, C)) + for k in range(D_out): + d_start = adaptive_start_index(k, D, output_size[0]) + d_end = adaptive_end_index(k, D, output_size[0]) + + for i in range(H_out): + h_start = adaptive_start_index(i, H, output_size[1]) + h_end = adaptive_end_index(i, H, output_size[1]) + + for j in range(W_out): + w_start = adaptive_start_index(j, W, output_size[2]) + w_end = adaptive_end_index(j, W, output_size[2]) + + if data_format == 'NCDHW': + x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start: + w_end] + if pool_type == 'avg': + field_size = 
(d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + out[:, :, k, i, j] = np.sum(x_masked, + axis=(2, 3, 4)) / field_size + elif pool_type == 'max': + out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) + + elif data_format == 'NDHWC': + x_masked = x[:, d_start:d_end, h_start:h_end, w_start: + w_end, :] + if pool_type == 'avg': + field_size = (d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + out[:, k, i, j, :] = np.sum(x_masked, + axis=(1, 2, 3)) / field_size + elif pool_type == 'max': + out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3)) + return out + + +class TestAdaptiveAvgPool3dAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[3, 3, 3], pool_type="avg") + + self.res_2_np = adaptive_pool3d_forward( + x=self.x_np, output_size=5, pool_type="avg") + + self.res_3_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[2, 3, 5], pool_type="avg") + + self.res_4_np = adaptive_pool3d_forward( + x=self.x_np, + output_size=[3, 3, 3], + pool_type="avg", + data_format="NDHWC") + + self.res_5_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[None, 3, None], pool_type="avg") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + + out_1 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[3, 3, 3]) + + out_2 = paddle.nn.functional.adaptive_avg_pool3d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[2, 3, 5]) + + out_4 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[3, 3, 3], data_format="NDHWC") + + out_5 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[None, 3, None]) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_4, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_4, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + out_1 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[3, 3, 3]) + + out_2 = paddle.nn.functional.adaptive_avg_pool3d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[2, 3, 5]) + + out_4 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[3, 3, 3], data_format="NDHWC") + + out_5 = paddle.nn.functional.adaptive_avg_pool3d( + x=x, output_size=[None, 3, None]) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool3d_forward( + x=self.x_np, 
output_size=[3, 3, 3], pool_type="avg") + + self.res_2_np = adaptive_pool3d_forward( + x=self.x_np, output_size=5, pool_type="avg") + + self.res_3_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[2, 3, 5], pool_type="avg") + + self.res_4_np = adaptive_pool3d_forward( + x=self.x_np, + output_size=[3, 3, 3], + pool_type="avg", + data_format="NDHWC") + + self.res_5_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[None, 3, None], pool_type="avg") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[3, 3, 3]) + out_1 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5) + out_2 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[2, 3, 5]) + out_3 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[3, 3, 3], data_format="NDHWC") + out_4 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[None, 3, None]) + out_5 = adaptive_avg_pool(x=x) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_4, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_4, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[3, 3, 3]) + out_1 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5) + out_2 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[2, 3, 5]) + out_3 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[3, 3, 3], data_format="NDHWC") + out_4 = adaptive_avg_pool(x=x) + + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( + output_size=[None, 3, None]) + out_5 = adaptive_avg_pool(x=x) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py index c524fb6930d97c0eb2971d09e751a54628d41325..6157314b1f060577a7d058f0de9a42f6368947ff 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py @@ -63,7 +63,7 @@ class TestAffineChannelOp(OpTest): self.check_grad(['X'], 'Out', no_grad_set=set(['Scale', 'Bias'])) def init_test_case(self): - self.shape = [2, 100, 12, 12] + self.shape = [2, 100, 3, 3] self.C = 100 self.layout = 'NCHW' @@ -102,7 +102,7 @@ class 
TestAffineChannelOpError(unittest.TestCase): class TestAffineChannelNHWC(TestAffineChannelOp): def init_test_case(self): - self.shape = [2, 12, 12, 100] + self.shape = [2, 3, 3, 100] self.C = 100 self.layout = 'NHWC' @@ -115,7 +115,7 @@ class TestAffineChannelNHWC(TestAffineChannelOp): class TestAffineChannel2D(TestAffineChannelOp): def init_test_case(self): - self.shape = [8, 100] + self.shape = [2, 100] self.C = 100 self.layout = 'NCHW' diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_function.py b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py new file mode 100644 index 0000000000000000000000000000000000000000..c874cf197ea88c7f12b9b24223d40d22be268b10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py @@ -0,0 +1,149 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from paddle import fluid, nn +import paddle.fluid.dygraph as dg +import paddle.nn.functional as F +import paddle.fluid.initializer as I +import unittest + + +class AffineGridTestCase(unittest.TestCase): + def __init__(self, + methodName='runTest', + theta_shape=(20, 2, 3), + output_shape=[20, 2, 5, 7], + align_corners=True, + dtype="float32", + invalid_theta=False, + variable_output_shape=False): + super(AffineGridTestCase, self).__init__(methodName) + + self.theta_shape = theta_shape + self.output_shape = output_shape + self.align_corners = align_corners + self.dtype = dtype + self.invalid_theta = invalid_theta + self.variable_output_shape = variable_output_shape + + def setUp(self): + self.theta = np.random.randn(*(self.theta_shape)).astype(self.dtype) + + def fluid_layer(self, place): + # align_corners = True + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + theta_var = fluid.data( + "input", self.theta_shape, dtype=self.dtype) + y_var = fluid.layers.affine_grid(theta_var, self.output_shape) + feed_dict = {"input": self.theta} + exe = fluid.Executor(place) + exe.run(start) + y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def functional(self, place): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + theta_var = fluid.data( + "input", self.theta_shape, dtype=self.dtype) + y_var = F.affine_grid( + theta_var, + self.output_shape, + align_corners=self.align_corners) + feed_dict = {"input": self.theta} + exe = fluid.Executor(place) + exe.run(start) + y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def paddle_dygraph_layer(self): + theta_var = dg.to_variable( + self.theta) if not self.invalid_theta else "invalid" + output_shape = dg.to_variable( + self. 
+ output_shape) if self.variable_output_shape else self.output_shape + y_var = F.affine_grid( + theta_var, output_shape, align_corners=self.align_corners) + y_np = y_var.numpy() + return y_np + + def _test_equivalence(self, place): + place = fluid.CPUPlace() + result1 = self.fluid_layer(place) + result2 = self.functional(place) + with dg.guard(place): + result3 = self.paddle_dygraph_layer() + if self.align_corners: + np.testing.assert_array_almost_equal(result1, result2) + np.testing.assert_array_almost_equal(result2, result3) + + def runTest(self): + place = fluid.CPUPlace() + self._test_equivalence(place) + + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + self._test_equivalence(place) + + +class AffineGridErrorTestCase(AffineGridTestCase): + def runTest(self): + place = fluid.CPUPlace() + with dg.guard(place): + with self.assertRaises(ValueError): + self.paddle_dygraph_layer() + + +def add_cases(suite): + suite.addTest(AffineGridTestCase(methodName='runTest')) + suite.addTest(AffineGridTestCase(methodName='runTest', align_corners=True)) + + suite.addTest(AffineGridTestCase(methodName='runTest', align_corners=False)) + suite.addTest( + AffineGridTestCase( + methodName='runTest', variable_output_shape=True)) + + suite.addTest( + AffineGridTestCase( + methodName='runTest', + theta_shape=(20, 2, 3), + output_shape=[20, 1, 7, 7], + align_corners=True)) + + +def add_error_cases(suite): + suite.addTest( + AffineGridErrorTestCase( + methodName='runTest', output_shape="not_valid")) + suite.addTest( + AffineGridErrorTestCase( + methodName='runTest', + invalid_theta=True)) # to test theta not variable error checking + + +def load_tests(loader, standard_tests, pattern): + suite = unittest.TestSuite() + add_cases(suite) + add_error_cases(suite) + return suite + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py index 3668c4f4aa174e34dcc96d40ddae7b359c1bee18..55612d71a17a7ae9801535bf5a35c83b100aab30 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py @@ -17,14 +17,20 @@ import numpy as np from op_test import OpTest -def AffineGrid(theta, size): +def AffineGrid(theta, size, align_corners): n = size[0] w = size[3] h = size[2] + h_factor = w_factor = 1 + if not align_corners: + h_factor = (h - 1) / float(h) + w_factor = (w - 1) / float(w) h_idx = np.repeat( - np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis] + np.linspace(-1, 1, h)[np.newaxis, :], w, + axis=0).T[:, :, np.newaxis] * h_factor w_idx = np.repeat( - np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis] + np.linspace(-1, 1, w)[np.newaxis, :], h, + axis=0)[:, :, np.newaxis] * w_factor grid = np.concatenate( [w_idx, h_idx, np.ones([h, w, 1])], axis=2) # h * w * 3 grid = np.repeat(grid[np.newaxis, :], size[0], axis=0) # n * h * w *3 @@ -45,12 +51,17 @@ class TestAffineGridOp(OpTest): theta = np.random.randint(1, 3, self.theta_shape).astype("float32") theta = np.ones(self.theta_shape).astype("float32") self.inputs = {'Theta': theta} - self.attrs = {"use_cudnn": True} + self.attrs = { + "use_cudnn": self.use_cudnn, + "align_corners": self.align_corners + } if self.dynamic_shape: self.inputs['OutputShape'] = self.output_shape else: self.attrs['output_shape'] = self.output_shape - self.outputs = {'Output': AffineGrid(theta, self.output_shape)} + self.outputs = { + 'Output': 
AffineGrid(theta, self.output_shape, self.align_corners)
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -62,6 +73,8 @@ class TestAffineGridOp(OpTest):
         self.theta_shape = (17, 2, 3)
         self.output_shape = np.array([17, 2, 5, 7]).astype("int32")
         self.dynamic_shape = False
+        self.use_cudnn = False
+        self.align_corners = True
 
 
 class TestAffineGridOpCase1(TestAffineGridOp):
@@ -69,6 +82,35 @@ class TestAffineGridOpCase1(TestAffineGridOp):
         self.theta_shape = (20, 2, 3)
         self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
         self.dynamic_shape = True
+        self.use_cudnn = True
+        self.align_corners = True
+
+
+class TestAffineGridOpCase2(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (20, 2, 3)
+        self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+        self.use_cudnn = False
+        self.align_corners = True
+
+
+class TestAffineGridOpCase3(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (20, 2, 3)
+        self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+        self.use_cudnn = False
+        self.align_corners = False
+
+
+class TestAffineGridOpCase4(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (25, 2, 3)
+        self.output_shape = np.array([25, 2, 5, 6]).astype("int32")
+        self.dynamic_shape = False
+        self.use_cudnn = False
+        self.align_corners = False
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_allclose_op.py b/python/paddle/fluid/tests/unittests/test_allclose_op.py
index 5b5ed2641880ade671434185414fa45c26901a2d..dc50e569f80433a5730b1ea33a6f3b4922d99c91 100644
--- a/python/paddle/fluid/tests/unittests/test_allclose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_allclose_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 
 
 class TestAllcloseOp(OpTest):
@@ -76,5 +77,58 @@ class TestAllcloseOpNanTrue(TestAllcloseOp):
         self.equal_nan = True
 
 
+class TestAllcloseDygraph(unittest.TestCase):
+    def test_api_case(self):
+        paddle.disable_static()
+        x_data = np.random.rand(10, 10)
+        y_data = np.random.rand(10, 10)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.allclose(x, y, rtol=1e-05, atol=1e-08)
+        expected_out = np.allclose(x_data, y_data, rtol=1e-05, atol=1e-08)
+        self.assertTrue((out.numpy() == expected_out).all(), True)
+        paddle.enable_static()
+
+
+class TestAllcloseError(unittest.TestCase):
+    def test_input_dtype(self):
+        def test_x_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float16')
+                y = paddle.data(name='y', shape=[10, 10], dtype='float64')
+                result = paddle.allclose(x, y)
+
+        self.assertRaises(TypeError, test_x_dtype)
+
+        def test_y_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+                y = paddle.data(name='y', shape=[10, 10], dtype='int32')
+                result = paddle.allclose(x, y)
+
+        self.assertRaises(TypeError, test_y_dtype)
+
+    def test_attr(self):
+        x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+        y = paddle.data(name='y', shape=[10, 10], dtype='float64')
+
+        def test_rtol():
+            result = paddle.allclose(x, y, rtol=True)
+
+        self.assertRaises(TypeError, test_rtol)
+
+        def test_atol():
+            result = paddle.allclose(x, y, atol=True)
+
+        self.assertRaises(TypeError, test_atol)
+
+        def test_equal_nan():
+            result = paddle.allclose(x, y, equal_nan=1)
+
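+        # equal_nan is expected to be a bool here, so passing the int 1 should raise TypeError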
self.assertRaises(TypeError, test_equal_nan) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py index 0201f0635a5afeb285cdbca3e8d526a1ff5032f2..3639c4dea0a3a12aa46d2875affeebd4c623a4dd 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -201,107 +201,5 @@ class BaseTestComplex2_2(OpTest): } -class APT_ArgMaxTest(unittest.TestCase): - def test_output_result(self): - with fluid.program_guard(fluid.Program()): - data1 = fluid.data(name="X", shape=[3, 4], dtype="float32") - data2 = fluid.data(name="Y", shape=[3], dtype="int64") - out = paddle.argmax(input=data1, out=data2) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - result = exe.run( - feed={"X": np.random.rand(3, 4).astype("float32")}, - fetch_list=[data2, out]) - self.assertEqual((result[0] == result[1]).all(), True) - - def test_basic(self): - with fluid.program_guard(fluid.Program()): - data = fluid.data(name="X", shape=[3, 4], dtype="float32") - out = paddle.argmax(input=data) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - np_input = np.random.rand(3, 4).astype("float32") - expected_result = np.argmax(np_input, axis=1) - - result, = exe.run(feed={"X": np_input}, fetch_list=[out]) - self.assertEqual((result == expected_result).all(), True) - - with fluid.program_guard(fluid.Program()): - data = fluid.data(name="X", shape=[3, 4], dtype="float32") - out = paddle.argmax(input=data, axis=0) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - np_input = np.random.rand(3, 4).astype("float32") - expected_result = np.argmax(np_input, axis=0) - - result = exe.run(feed={"X": np_input}, fetch_list=[out]) - self.assertEqual((result == expected_result).all(), True) - - with fluid.program_guard(fluid.Program()): - data = fluid.data(name="X", shape=[3, 4], dtype="float32") - out = paddle.argmax(input=data, dtype="int32") - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - np_input = np.random.rand(3, 4).astype("float32") - expected_result = np.argmax(np_input, axis=1).astype(np.int32) - - result = exe.run(feed={"X": np_input}, fetch_list=[out]) - self.assertEqual((result == expected_result).all(), True) - - with fluid.program_guard(fluid.Program()): - data1 = fluid.data(name="X", shape=[3, 4], dtype="float32") - data2 = fluid.data(name="Y", shape=[3], dtype="int64") - out = paddle.argmax(input=data, out=data2) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - result = exe.run( - feed={"X": np.random.rand(3, 4).astype("float32")}, - fetch_list=[data2, out]) - self.assertEqual((result[0] == result[1]).all(), True) - - def test_name(self): - with fluid.program_guard(fluid.Program()): - x = fluid.data(name="x", shape=[100], dtype="float32") - y_1 = paddle.argmax(x, name='arg_max_res') - self.assertEqual(('arg_max_res' in y_1.name), True) - - def test_errors(self): - def test_dtype1(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float32") - paddle.argmax(data, dtype="float32") - - self.assertRaises(TypeError, test_dtype1) - - def test_dtype2(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float64") - paddle.argmax(data, dtype="float32") - - self.assertRaises(TypeError, test_dtype2) - - -class TestArgMinMaxOpError(unittest.TestCase): - def 
test_errors(self): - with program_guard(Program(), Program()): - - def test_argmax_x_type(): - x1 = [1, 2, 3] - output = fluid.layers.argmax(x=x1) - - self.assertRaises(TypeError, test_argmax_x_type) - - def test_argmin_x_type(): - x2 = [1, 2, 3] - output = fluid.layers.argmin(x=x2) - - self.assertRaises(TypeError, test_argmin_x_type) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7c1f9d802c31ac2c3b244541936ba25018e1487a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py @@ -0,0 +1,313 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard + + +def create_kernel_case(op_type, numpy_op_type): + class ArgMinMaxKernelBaseCase(OpTest): + def initTestCase(self): + self.op_type = op_type + self.numpy_op_type = numpy_op_type + self.axis = 0 + + def setUp(self): + np.random.seed(123) + self.initTestCase() + self.dims = (4, 5, 6) + self.dtype = "float64" + self.x = (1000 * np.random.random(self.dims).astype(self.dtype)) + self.inputs = {'X': self.x} + self.attrs = {"axis": self.axis} + self.numpy_op = eval("np.%s" % (numpy_op_type)) + self.outputs = {'Out': self.numpy_op(self.x, axis=self.axis)} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + class ArgMinMaxKernelCase0(ArgMinMaxKernelBaseCase): + def initTestCase(self): + self.op_type = op_type + self.numpy_op_type = numpy_op_type + self.axis = 1 + + class ArgMinMaxKernelCase1(ArgMinMaxKernelBaseCase): + def initTestCase(self): + self.op_type = op_type + self.numpy_op_type = numpy_op_type + self.axis = 2 + + class ArgMinMaxKernelCase2(ArgMinMaxKernelBaseCase): + def initTestCase(self): + self.op_type = op_type + self.numpy_op_type = numpy_op_type + self.axis = -1 + + class ArgMinMaxKernelCase3(ArgMinMaxKernelBaseCase): + def initTestCase(self): + self.op_type = op_type + self.numpy_op_type = numpy_op_type + self.axis = -2 + + class ArgMinMaxKernelCase4(ArgMinMaxKernelBaseCase): + def setUp(self): + self.initTestCase() + self.dims = (4, 5, 6) + self.dtype = "float64" + self.x = (1000 * np.random.random(self.dims).astype(self.dtype)) + self.inputs = {'X': self.x} + self.attrs = {"axis": self.axis, "keepdims": True} + self.numpy_op = eval("np.%s" % (numpy_op_type)) + self.outputs = { + 'Out': self.numpy_op( + self.x, axis=self.axis).reshape((1, 5, 6)) + } + + class ArgMinMaxKernelCase5(ArgMinMaxKernelBaseCase): + def setUp(self): + self.initTestCase() + self.dims = (4) + self.dtype = "float64" + self.x = (1000 * np.random.random(self.dims).astype(self.dtype)) + self.inputs = {'X': self.x} + 
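+            # flatten=True makes the kernel operate on the flattened input, so the
+            # numpy reference below is computed from self.x.flatten()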
self.attrs = {"axis": self.axis, "flatten": True} + self.numpy_op = eval("np.%s" % (numpy_op_type)) + self.outputs = { + 'Out': self.numpy_op( + self.x.flatten(), axis=self.axis) + } + + class ArgMinMaxKernelCase6(ArgMinMaxKernelBaseCase): + def setUp(self): + self.initTestCase() + self.dims = (4) + self.dtype = "float64" + self.x = (1000 * np.random.random(self.dims).astype(self.dtype)) + self.inputs = {'X': self.x} + self.attrs = {"axis": self.axis, "flatten": True, "keepdims": True} + self.numpy_op = eval("np.%s" % (numpy_op_type)) + self.outputs = { + 'Out': + np.array(self.numpy_op( + self.x.flatten(), axis=self.axis)) + } + + cls_name = "ArgMinMaxKernelBaseCase_%s" % (op_type) + ArgMinMaxKernelBaseCase.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelBaseCase + + cls_name = "ArgMinMaxKernelCase0_%s" % (op_type) + ArgMinMaxKernelCase0.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelCase0 + + cls_name = "ArgMinMaxKernelCase1_%s" % (op_type) + ArgMinMaxKernelCase1.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelCase1 + + cls_name = "ArgMinMaxKernelCase2_%s" % (op_type) + ArgMinMaxKernelCase2.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelCase2 + + cls_name = "ArgMinMaxKernelCase3_%s" % (op_type) + ArgMinMaxKernelCase3.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelCase3 + + cls_name = "ArgMinMaxKernelCase4_%s" % (op_type) + ArgMinMaxKernelCase4.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelCase4 + + cls_name = "ArgMinMaxKernelCase5_%s" % (op_type) + ArgMinMaxKernelCase5.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelCase5 + + cls_name = "ArgMinMaxKernelCase6_%s" % (op_type) + ArgMinMaxKernelCase6.__name__ = cls_name + globals()[cls_name] = ArgMinMaxKernelCase6 + + +for op_type, numpy_op_type in zip(['arg_max', 'arg_min'], ['argmax', 'argmin']): + create_kernel_case(op_type, numpy_op_type) + + +def create_test_case(op_type): + class ArgMaxMinTestCase(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.input_data = np.random.rand(10, 10).astype("float32") + self.places = [] + self.places.append(fluid.CPUPlace()) + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + self.op = eval("paddle.%s" % (op_type)) + self.numpy_op = eval("np.%s" % (op_type)) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + data_var = paddle.static.data( + name="data", shape=[10, 10], dtype="float32") + op = eval("paddle.%s" % (op_type)) + result = op(data_var) + exe = paddle.static.Executor(place) + result_data = exe.run(feed={"data": self.input_data}, + fetch_list=[result]) + expected_data = self.numpy_op(self.input_data) + self.assertTrue((result_data == np.array(expected_data)).all(), + True) + + with paddle.static.program_guard(paddle.static.Program()): + data_var = paddle.static.data( + name="data", shape=[10, 10], dtype="float32") + op = eval("paddle.%s" % (op_type)) + result = op(data_var, axis=1) + exe = paddle.static.Executor(place) + result_data = exe.run(feed={"data": self.input_data}, + fetch_list=[result]) + expected_data = self.numpy_op(self.input_data, axis=1) + self.assertTrue((result_data == expected_data).all(), True) + + with paddle.static.program_guard(paddle.static.Program()): + data_var = paddle.static.data( + name="data", shape=[10, 10], dtype="float32") + op = eval("paddle.%s" % (op_type)) + result = op(data_var, axis=-1) + exe = paddle.static.Executor(place) + result_data = exe.run(feed={"data": 
self.input_data}, + fetch_list=[result]) + expected_data = self.numpy_op(self.input_data, axis=-1) + self.assertTrue((result_data == expected_data).all(), True) + + with paddle.static.program_guard(paddle.static.Program()): + data_var = paddle.static.data( + name="data", shape=[10, 10], dtype="float32") + + op = eval("paddle.%s" % (op_type)) + result = op(data_var, axis=-1, keepdim=True) + exe = paddle.static.Executor(place) + result_data = exe.run(feed={"data": self.input_data}, + fetch_list=[result]) + expected_data = self.numpy_op( + self.input_data, axis=-1).reshape((10, 1)) + self.assertTrue((result_data == expected_data).all(), True) + + with paddle.static.program_guard(paddle.static.Program()): + op = eval("paddle.%s" % (op_type)) + data_var = paddle.static.data( + name="data", shape=[10, 10], dtype="float32") + result = op(data_var, axis=-1, name="test_arg_api") + self.assertTrue("test_arg_api" in result.name) + + def run_dygraph(self, place): + paddle.disable_static() + op = eval("paddle.%s" % (op_type)) + data_tensor = paddle.to_tensor(self.input_data) + + #case 1 + result_data = op(data_tensor) + excepted_data = self.numpy_op(self.input_data) + self.assertTrue((result_data.numpy() == excepted_data).all(), True) + + #case 2 + result_data = op(data_tensor, axis=1) + excepted_data = self.numpy_op(self.input_data, axis=1) + self.assertTrue((result_data.numpy() == excepted_data).all(), True) + + #case 3 + result_data = op(data_tensor, axis=-1) + excepted_data = self.numpy_op(self.input_data, axis=-1) + self.assertTrue((result_data.numpy() == excepted_data).all(), True) + + #case 4 + result_data = op(data_tensor, axis=-1, keepdim=True) + excepted_data = self.numpy_op(self.input_data, axis=-1) + excepted_data = excepted_data.reshape((10)) + self.assertTrue((result_data.numpy() == excepted_data).all(), True) + + #case 5 + result_data = op(data_tensor, axis=-1, keepdim=True, dtype="int32") + self.assertTrue(result_data.numpy().dtype == np.int32) + + # case for dim 4, 5, 6, for test case coverage + input_data = np.random.rand(5, 5, 5, 5) + excepted_data = self.numpy_op(input_data, axis=0) + result_data = op(paddle.to_tensor(input_data), axis=0) + self.assertTrue((result_data.numpy() == excepted_data).all(), True) + + input_data = np.random.rand(4, 4, 4, 4, 4) + excepted_data = self.numpy_op(input_data, axis=0) + result_data = op(paddle.to_tensor(input_data), axis=0) + self.assertTrue((result_data.numpy() == excepted_data).all(), True) + + input_data = np.random.rand(3, 3, 3, 3, 3, 3) + excepted_data = self.numpy_op(input_data, axis=0) + result_data = op(paddle.to_tensor(input_data), axis=0) + self.assertTrue((result_data.numpy() == excepted_data).all(), True) + + def test_case(self): + for place in self.places: + self.run_static(place) + self.run_dygraph(place) + + cls_name = "ArgMaxMinTestCase_{}".format(op_type) + ArgMaxMinTestCase.__name__ = cls_name + globals()[cls_name] = ArgMaxMinTestCase + + +for op_type in ['argmin', 'argmax']: + create_test_case(op_type) + + +class TestArgMinMaxOpError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + + def test_argmax_x_type(): + x1 = [1, 2, 3] + output = paddle.argmax(x=x1) + + self.assertRaises(TypeError, test_argmax_x_type) + + def test_argmin_x_type(): + x2 = [1, 2, 3] + output = paddle.argmin(x=x2) + + self.assertRaises(TypeError, test_argmin_x_type) + + def test_argmax_attr_type(): + data = paddle.static.data( + name="test_argmax", shape=[10], dtype="float32") + output = 
paddle.argmax(x=data, dtype="float32") + + self.assertRaises(ValueError, test_argmax_attr_type) + + def test_argmin_attr_type(): + data = paddle.static.data( + name="test_argmax", shape=[10], dtype="float32") + output = paddle.argmin(x=data, dtype="float32") + + self.assertRaises(ValueError, test_argmin_attr_type) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py index 729fe20c8f87ed1cd07b7f5c2784a79acd1fa54b..fd009db5fd00133c5bad7c8c52662002ebd03fa8 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py @@ -59,6 +59,10 @@ class AutoCheckPointACLBase(AutoCheckpointBase): os.environ.clear() os.environ.update(self._old_environ) + file_name = os.path.basename(__file__) + base_name = os.path.splitext(file_name)[0] + print("runnng name:", base_name) + def _run_normal(self): exe, main_prog, startup_prog = self._generate() @@ -182,6 +186,20 @@ class AutoCheckPointACLBase(AutoCheckpointBase): fs.delete(save_dir) logger.info("begin _run_load_0") + def _test_corner_epoch_no(self, break_epoch_no): + logger.info("begin test_corener_epoch_no") + checker = acp._get_checker() + fs = HDFSClient(checker.hdfs_home, None) + + fs.delete(checker.hdfs_checkpoint_path) + self._reset_generator() + self._run_save_0(break_epoch_no=break_epoch_no) + self._reset_generator() + self._run_load_0(break_epoch_no=break_epoch_no) + + fs.delete(checker.hdfs_checkpoint_path) + logger.info("end test_corener_epoch_no") + class AutoCheckpointTest(AutoCheckPointACLBase): def setUp(self): @@ -193,13 +211,13 @@ class AutoCheckpointTest(AutoCheckPointACLBase): "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT", "PADDLE_TRAINER_ID": "0", "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD", - "PADDLE_JOB_ID": "test_job_auto_1", + "PADDLE_JOB_ID": "test_job_auto_0", "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7", "PADDLE_EDL_HDFS_NAME": "", "PADDLE_EDL_HDFS_UGI": "", - "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_1", + "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_0", "PADDLE_EDL_ONLY_FOR_CE_TEST": "1", - "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_1", + "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_0", "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0" } os.environ.update(proc_env) @@ -246,102 +264,6 @@ class AutoCheckpointTest(AutoCheckPointACLBase): logger.info("end test_not_use") - def test_multiple(self): - checker = acp._get_checker() - fs = HDFSClient(checker.hdfs_home, None) - fs.delete(checker.hdfs_checkpoint_path) - self._reset_generator() - - logger.info("begin test_multiple") - fs = LocalFS() - save_dir = "./run_save_0" - fs.delete(save_dir) - - exe, main_prog1, startup_prog1 = self._generate() - _, main_prog2, startup_prog2 = self._generate() - - compiled1, data_loader1, optimizer1, loss1, image1, label1 = \ - self._init_env(exe, main_prog1, startup_prog1) - - compiled2, data_loader2, optimizer2, loss2, image2, label2 = \ - self._init_env(exe, main_prog2, startup_prog2) - - o = None - epochs = [] - for i in acp.train_epoch_range(3, 0): - for data in data_loader1(): - fetch = exe.run(compiled1, feed=data, fetch_list=[loss1]) - - for data in data_loader2(): - fetch = exe.run(compiled2, feed=data, fetch_list=[loss2]) - - o = acp._get_train_epoch_range() - self.assertEqual(len(o._exe_status), 2) - print(o._exe_status) - epochs.append(i) - - o = acp._get_train_epoch_range() - self.assertTrue(o == None, "now train 
epoch must not exits now") - self.assertEqual(i, 2) - self.assertEqual(epochs, [0, 1, 2]) - - fs.delete(save_dir) - logger.info("end test_multiple") - - def test_distributed_basic(self): - checker = acp._get_checker() - fs = HDFSClient(checker.hdfs_home, None) - fs.delete(checker.hdfs_checkpoint_path) - self._reset_generator() - - logger.info("begin test_distributed_basic") - fs = LocalFS() - save_dir = "./run_save_0" - fs.delete(save_dir) - - #basic - exe, main_prog, startup_prog = self._generate() - - compiled, data_loader, optimizer, loss, image, label = \ - self._init_env(exe, main_prog, startup_prog, minimize=False) - - #fleet - os.environ["TRAINING_ROLE"] = "TRAINER" - os.environ["PADDLE_TRAINER_ID"] = "0" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070" - - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - - with fluid.program_guard(main_prog, startup_prog): - dist_optimizer = fleet.distributed_optimizer(optimizer) - dist_optimizer.minimize(loss) - - exe.run(startup_prog) - - o = None - i = 0 - name = None - for i in acp.train_epoch_range(3, 0): - o = acp._get_train_epoch_range() - name = o.name - logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i)) - - for data in data_loader(): - fetch = exe.run(fleet.main_program, - feed=data, - fetch_list=[loss]) - - self.assertEqual(len(o._exe_status), 1) - - o = acp._get_train_epoch_range() - assert o == None, "now train epoch must not exits now" - self.assertEqual(i, 2) - - fs.delete(save_dir) - - logger.info("end test_distributed_basic") - def test_checker(self): os.environ.pop("PADDLE_JOB_ID", None) try: diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py new file mode 100644 index 0000000000000000000000000000000000000000..55173325f621f7333a7c3ca32a9c55becee72e5a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
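+# This file only overrides the checkpoint job environment; the shared test logic
+# lives in AutoCheckPointACLBase (test_auto_checkpoint.py), and the single test
+# here runs the corner-epoch case with break_epoch_no=0.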
+ +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet +import os +import sys + +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient +import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp +from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel +from paddle.fluid.framework import program_guard +from paddle.fluid import unique_name + +import numpy as np +from paddle.io import Dataset, BatchSampler, DataLoader + +from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger +from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase + +logger = get_logger() + + +class AutoCheckpointTest1(AutoCheckPointACLBase): + def setUp(self): + get_logger() + logger.info("enter tests") + + self._old_environ = dict(os.environ) + proc_env = { + "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT", + "PADDLE_TRAINER_ID": "0", + "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD", + "PADDLE_JOB_ID": "test_job_auto_1", + "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7", + "PADDLE_EDL_HDFS_NAME": "", + "PADDLE_EDL_HDFS_UGI": "", + "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_1", + "PADDLE_EDL_ONLY_FOR_CE_TEST": "1", + "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_1", + "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0" + } + os.environ.update(proc_env) + + def test_corner_epoch_no(self): + self._test_corner_epoch_no(0) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py index 30a743510537e1fa0e2aeedb18de25c7e1fd120c..5d72fa01008af55a83d7b9a19747a8d96fb74b2b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py @@ -57,19 +57,7 @@ class AutoCheckpointTest2(AutoCheckPointACLBase): os.environ.update(proc_env) def test_corner_epoch_no(self): - logger.info("begin test_corener_epoch_no") - checker = acp._get_checker() - fs = HDFSClient(checker.hdfs_home, None) - - for i in range(3): - fs.delete(checker.hdfs_checkpoint_path) - self._reset_generator() - self._run_save_0(break_epoch_no=i) - self._reset_generator() - self._run_load_0(break_epoch_no=i) - - fs.delete(checker.hdfs_checkpoint_path) - logger.info("end test_corener_epoch_no") + self._test_corner_epoch_no(1) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py new file mode 100644 index 0000000000000000000000000000000000000000..5382f7e328ed1afa2d7516cd0d8db2db659aadd7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet +import os +import sys + +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient +import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp +from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel +from paddle.fluid.framework import program_guard +from paddle.fluid import unique_name + +import numpy as np +from paddle.io import Dataset, BatchSampler, DataLoader + +from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger +from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase + +logger = get_logger() + + +class AutoCheckpointTest3(AutoCheckPointACLBase): + def setUp(self): + get_logger() + logger.info("enter tests") + + self._old_environ = dict(os.environ) + proc_env = { + "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT", + "PADDLE_TRAINER_ID": "0", + "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD", + "PADDLE_JOB_ID": "test_job_auto_3", + "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7", + "PADDLE_EDL_HDFS_NAME": "", + "PADDLE_EDL_HDFS_UGI": "", + "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_3", + "PADDLE_EDL_ONLY_FOR_CE_TEST": "1", + "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_3", + "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0" + } + os.environ.update(proc_env) + + def test_corner_epoch_no(self): + self._test_corner_epoch_no(2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..90db9595d92ef602c03fa7dd104484a4f6101a87 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py @@ -0,0 +1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet +import os +import sys + +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient +import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp +from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel +from paddle.fluid.framework import program_guard +from paddle.fluid import unique_name + +import numpy as np +from paddle.io import Dataset, BatchSampler, DataLoader + +from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger +from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase + +logger = get_logger() + + +class AutoCheckpointTestDist(AutoCheckPointACLBase): + def setUp(self): + get_logger() + logger.info("enter tests") + + self._old_environ = dict(os.environ) + proc_env = { + "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT", + "PADDLE_TRAINER_ID": "0", + "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD", + "PADDLE_JOB_ID": "test_job_auto_dist_basic", + "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7", + "PADDLE_EDL_HDFS_NAME": "", + "PADDLE_EDL_HDFS_UGI": "", + "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_dist_basic", + "PADDLE_EDL_ONLY_FOR_CE_TEST": "1", + "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_dist_basic", + "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0" + } + os.environ.update(proc_env) + + def test_distributed_basic(self): + checker = acp._get_checker() + fs = HDFSClient(checker.hdfs_home, None) + fs.delete(checker.hdfs_checkpoint_path) + self._reset_generator() + + logger.info("begin test_distributed_basic") + fs = LocalFS() + save_dir = "./run_save_0" + fs.delete(save_dir) + + #basic + exe, main_prog, startup_prog = self._generate() + + compiled, data_loader, optimizer, loss, image, label = \ + self._init_env(exe, main_prog, startup_prog, minimize=False) + + #fleet + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070" + + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + + with fluid.program_guard(main_prog, startup_prog): + dist_optimizer = fleet.distributed_optimizer(optimizer) + dist_optimizer.minimize(loss) + + exe.run(startup_prog) + + o = None + i = 0 + name = None + for i in acp.train_epoch_range(3, 0): + o = acp._get_train_epoch_range() + name = o.name + logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i)) + + for data in data_loader(): + fetch = exe.run(fleet.main_program, + feed=data, + fetch_list=[loss]) + + self.assertEqual(len(o._exe_status), 1) + + o = acp._get_train_epoch_range() + assert o == None, "now train epoch must not exits now" + self.assertEqual(i, 2) + + fs.delete(save_dir) + + logger.info("end test_distributed_basic") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py new file mode 100644 index 0000000000000000000000000000000000000000..8c10cd0e9922859bf3bad2015587fc0a6b2ba5da --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet +import os +import sys + +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient +import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp +from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel +from paddle.fluid.framework import program_guard +from paddle.fluid import unique_name + +import numpy as np +from paddle.io import Dataset, BatchSampler, DataLoader + +from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger +from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase + +logger = get_logger() + + +class AutoCheckpointTestMul(AutoCheckPointACLBase): + def setUp(self): + get_logger() + logger.info("enter tests") + + self._old_environ = dict(os.environ) + proc_env = { + "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT", + "PADDLE_TRAINER_ID": "0", + "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD", + "PADDLE_JOB_ID": "test_job_auto_dist_multiple", + "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7", + "PADDLE_EDL_HDFS_NAME": "", + "PADDLE_EDL_HDFS_UGI": "", + "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_dist_multiple", + "PADDLE_EDL_ONLY_FOR_CE_TEST": "1", + "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_dist_multiple", + "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0" + } + os.environ.update(proc_env) + + def test_multiple(self): + checker = acp._get_checker() + fs = HDFSClient(checker.hdfs_home, None) + fs.delete(checker.hdfs_checkpoint_path) + self._reset_generator() + + logger.info("begin test_multiple") + fs = LocalFS() + save_dir = "./run_save_0" + fs.delete(save_dir) + + exe, main_prog1, startup_prog1 = self._generate() + _, main_prog2, startup_prog2 = self._generate() + + compiled1, data_loader1, optimizer1, loss1, image1, label1 = \ + self._init_env(exe, main_prog1, startup_prog1) + + compiled2, data_loader2, optimizer2, loss2, image2, label2 = \ + self._init_env(exe, main_prog2, startup_prog2) + + o = None + epochs = [] + for i in acp.train_epoch_range(3, 0): + for data in data_loader1(): + fetch = exe.run(compiled1, feed=data, fetch_list=[loss1]) + + for data in data_loader2(): + fetch = exe.run(compiled2, feed=data, fetch_list=[loss2]) + + o = acp._get_train_epoch_range() + self.assertEqual(len(o._exe_status), 2) + print(o._exe_status) + epochs.append(i) + + o = acp._get_train_epoch_range() + self.assertTrue(o == None, "now train epoch must not exits now") + self.assertEqual(i, 2) + self.assertEqual(epochs, [0, 1, 2]) + + fs.delete(save_dir) + logger.info("end test_multiple") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index bc666c0de5be06be7529bced39071303430c8ace..875f6211a7fbd98463d98dff91d93cc1b431fc86 
100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -86,6 +86,31 @@ class TestBaseLayer(unittest.TestCase): ret = l() self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2]))) + def test_add_parameter_with_error(self): + with fluid.dygraph.guard(): + net = fluid.Layer() + param = net.create_parameter(shape=[1]) + + with self.assertRaises(TypeError): + net.add_parameter(10, param) + + with self.assertRaises(KeyError): + net.add_parameter("param.name", param) + + with self.assertRaises(KeyError): + net.add_parameter("", param) + + with self.assertRaises(KeyError): + net.test_param = 10 + net.add_parameter("test_param", param) + + with self.assertRaises(TypeError): + net.add_parameter("no_param", 10) + + load_param = net.create_parameter(shape=[1]) + net._loaddict_holder[load_param.name] = load_param + net.add_parameter("load_param", load_param) + class BufferLayer(fluid.Layer): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d3c6e7d0492b2f4a98a595f015e3b9f4a19e24 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -0,0 +1,129 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
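+# These cases compare the new paddle.nn.BatchNorm1d/2d/3d layers against the
+# legacy fluid.dygraph.BatchNorm implementation and check that a mismatched
+# input rank raises ValueError.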
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestBatchNorm(unittest.TestCase): + def test_name(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + with fluid.dygraph.guard(p): + batch_norm1d = paddle.nn.BatchNorm1d(1, name="test") + + def test_error(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + #paddle.disable_static() + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + + def error1d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1d(1) + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2d(1) + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3d(1) + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + + with fluid.dygraph.guard(p): + self.assertRaises(ValueError, error1d) + self.assertRaises(ValueError, error2d) + self.assertRaises(ValueError, error3d) + + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2d(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + bn = paddle.nn.BatchNorm2d(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py index 2e2a6144fd0119ef920031a623f7eedd8ae5dbe7..6ec6fdb59f200ce1dc9b6418b7f11329f85ba5dd 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py +++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py @@ -88,6 +88,18 @@ class TestRandomSampler(unittest.TestCase): rets.append(i) assert tuple(sorted(rets)) == tuple(range(0, 60)) + def test_with_generator_num_samples(self): + dataset = RandomDataset(100, 10) + generator = iter(range(0, 60)) + sampler = RandomSampler( + dataset, generator=generator, num_samples=50, replacement=True) + assert len(sampler) == 50 + + rets = [] + for i in iter(sampler): + rets.append(i) + assert tuple(sorted(rets)) == tuple(range(0, 50)) + class TestBatchSampler(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py index 21571e0981065a0a1e2a5db03e91b4df0ea55d9a..a8054295b41c1f6d0008c4f0a9fadb6f04c647fc 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py @@ -19,93 +19,189 @@ import unittest from op_test import OpTest +def test_static_layer(place, + input_np, + label_np, + reduction='mean', + weight_np=None): + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + input = paddle.data(name='input', shape=input_np.shape, dtype='float64') + label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + if weight_np is not None: + weight = paddle.data( + name='weight', shape=weight_np.shape, dtype='float64') + bce_loss = paddle.nn.loss.BCELoss( + weight=weight, reduction=reduction) + else: + bce_loss = paddle.nn.loss.BCELoss(reduction=reduction) + res = bce_loss(input, label) + exe = paddle.static.Executor(place) + static_result = exe.run(prog, + feed={"input": input_np, + "label": label_np} + if weight_np is None else { + "input": input_np, + "label": label_np, + "weight": weight_np + }, + fetch_list=[res]) + return static_result + + +def test_static_functional(place, + input_np, + label_np, + reduction='mean', + weight_np=None): + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + input = paddle.data(name='input', shape=input_np.shape, dtype='float64') + label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + if weight_np is not None: + weight = paddle.data( + name='weight', shape=weight_np.shape, dtype='float64') + res = paddle.nn.functional.binary_cross_entropy( + input, label, weight=weight, reduction=reduction) + else: + res = paddle.nn.functional.binary_cross_entropy( + input, label, reduction=reduction) + exe = paddle.static.Executor(place) + static_result = exe.run(prog, + feed={"input": input_np, + "label": label_np} + if weight_np is None else { + "input": input_np, + "label": label_np, + "weight": weight_np + }, + fetch_list=[res]) + return static_result + + +def test_dygraph_layer(place, + input_np, + label_np, + reduction='mean', + weight_np=None): + paddle.disable_static() + if weight_np is not None: + weight = paddle.to_tensor(weight_np) + bce_loss = paddle.nn.loss.BCELoss(weight=weight, reduction=reduction) + else: + bce_loss = paddle.nn.loss.BCELoss(reduction=reduction) + dy_res = bce_loss(paddle.to_tensor(input_np), 
paddle.to_tensor(label_np)) + dy_result = dy_res.numpy() + paddle.enable_static() + return dy_result + + +def test_dygraph_functional(place, + input_np, + label_np, + reduction='mean', + weight_np=None): + paddle.disable_static() + input = paddle.to_tensor(input_np) + label = paddle.to_tensor(label_np) + + if weight_np is not None: + weight = paddle.to_tensor(weight_np) + dy_res = paddle.nn.functional.binary_cross_entropy( + input, label, weight=weight, reduction=reduction) + else: + dy_res = paddle.nn.functional.binary_cross_entropy( + input, label, reduction=reduction) + dy_result = dy_res.numpy() + paddle.enable_static() + return dy_result + + +def calc_bceloss(input_np, label_np, reduction='mean', weight_np=None): + if weight_np is None: + expected = -1 * (label_np * np.log(input_np) + + (1. - label_np) * np.log(1. - input_np)) + else: + expected = -1 * weight_np * (label_np * np.log(input_np) + + (1. - label_np) * np.log(1. - input_np)) + + if reduction == 'mean': + expected = np.mean(expected) + elif reduction == 'sum': + expected = np.sum(expected) + else: + expected = expected + + return expected + + class TestBCELoss(unittest.TestCase): def test_BCELoss(self): - input_np = np.random.random(size=(20, 30)).astype(np.float64) - label_np = np.random.random(size=(20, 30)).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() + input_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64) + label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64) places = [fluid.CPUPlace()] if fluid.core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) reductions = ['sum', 'mean', 'none'] for place in places: - for red in reductions: - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[None, 30], dtype='float64') - label = fluid.data( - name='label', shape=[None, 30], dtype='float64') - bce_loss = paddle.nn.loss.BCELoss(reduction=red) - res = bce_loss(input, label) - - exe = fluid.Executor(place) - static_result = exe.run( - prog, - feed={"input": input_np, - "label": label_np}, - fetch_list=[res]) - - with fluid.dygraph.guard(): - bce_loss = paddle.nn.loss.BCELoss(reduction=red) - dy_res = bce_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) - dy_result = dy_res.numpy() - - expected = -1 * (label_np * np.log(input_np) + - (1. - label_np) * np.log(1. 
- input_np)) - if red == 'mean': - expected = np.mean(expected) - elif red == 'sum': - expected = np.sum(expected) - else: - expected = expected + for reduction in reductions: + static_result = test_static_layer(place, input_np, label_np, + reduction) + dy_result = test_dygraph_layer(place, input_np, label_np, + reduction) + expected = calc_bceloss(input_np, label_np, reduction) self.assertTrue(np.allclose(static_result, expected)) self.assertTrue(np.allclose(static_result, dy_result)) self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static_functional(place, input_np, + label_np, reduction) + dy_functional = test_dygraph_functional(place, input_np, + label_np, reduction) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) def test_BCELoss_weight(self): - input_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64) - label_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64) + input_np = np.random.uniform( + 0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64) + label_np = np.random.randint( + 0, 2, size=(2, 3, 4, 10)).astype(np.float64) weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[None, 3, 4, 10], dtype='float64') - label = fluid.data( - name='label', shape=[None, 3, 4, 10], dtype='float64') - weight = fluid.data( - name='weight', shape=[3, 4, 10], dtype='float64') - bce_loss = paddle.nn.loss.BCELoss(weight=weight) - res = bce_loss(input, label) - - exe = fluid.Executor(place) - static_result = exe.run(prog, - feed={ - "input": input_np, - "label": label_np, - "weight": weight_np - }, - fetch_list=[res]) - - with fluid.dygraph.guard(): - bce_loss = paddle.nn.loss.BCELoss( - weight=fluid.dygraph.to_variable(weight_np)) - dy_res = bce_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) - dy_result = dy_res.numpy() - - expected = np.mean(-1 * weight_np * - (label_np * np.log(input_np) + - (1. - label_np) * np.log(1. 
- input_np))) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + for reduction in ['sum', 'mean', 'none']: + static_result = test_static_layer( + place, input_np, label_np, reduction, weight_np=weight_np) + dy_result = test_dygraph_layer( + place, input_np, label_np, reduction, weight_np=weight_np) + expected = calc_bceloss( + input_np, label_np, reduction, weight_np=weight_np) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static_functional( + place, input_np, label_np, reduction, weight_np=weight_np) + dy_functional = test_dygraph_functional( + place, input_np, label_np, reduction, weight_np=weight_np) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_BCELoss_error(self): + paddle.disable_static() + self.assertRaises( + ValueError, paddle.nn.loss.BCELoss, reduction="unsupport reduction") + input = paddle.to_tensor([[0.1, 0.3]], dtype='float32') + label = paddle.to_tensor([[0.0, 1.0]], dtype='float32') + self.assertRaises( + ValueError, + paddle.nn.functional.binary_cross_entropy, + input=input, + label=label, + reduction="unsupport reduction") + paddle.enable_static() def bce_loss(input, label): diff --git a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..5ba13a6da01c7dbf8b0e854df43f11b19a4ebd4c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py @@ -0,0 +1,260 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
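+# The numpy reference below (calc_bce_with_logits_loss) uses the numerically
+# stable form of the loss: max(x, 0) - x * y + log(1 + exp(-|x|)).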
+ +import paddle +import paddle.fluid as fluid +import numpy as np +import unittest +from op_test import OpTest + + +def call_bce_layer(logit, label, weight=None, reduction='mean', + pos_weight=None): + bce_logit_loss = paddle.nn.loss.BCEWithLogitsLoss( + weight=weight, reduction=reduction, pos_weight=pos_weight) + res = bce_logit_loss(logit, label) + return res + + +def call_bce_functional(logit, + label, + weight=None, + reduction='mean', + pos_weight=None): + res = paddle.nn.functional.binary_cross_entropy_with_logits( + logit, label, weight=weight, reduction=reduction, pos_weight=pos_weight) + return res + + +def test_static(place, + logit_np, + label_np, + weight_np=None, + reduction='mean', + pos_weight_np=None, + functional=False): + paddle.enable_static() + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + logit = paddle.data(name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + feed_dict = {"logit": logit_np, "label": label_np} + + pos_weight = None + weight = None + if pos_weight_np is not None: + pos_weight = paddle.data( + name='pos_weight', shape=pos_weight_np.shape, dtype='float64') + feed_dict["pos_weight"] = pos_weight_np + if weight_np is not None: + weight = paddle.data( + name='weight', shape=weight_np.shape, dtype='float64') + feed_dict["weight"] = weight_np + if functional: + res = call_bce_functional(logit, label, weight, reduction, + pos_weight) + else: + res = call_bce_layer(logit, label, weight, reduction, pos_weight) + exe = paddle.static.Executor(place) + static_result = exe.run(prog, feed=feed_dict, fetch_list=[res]) + return static_result + + +def test_dygraph(place, + logit_np, + label_np, + weight_np=None, + reduction='mean', + pos_weight_np=None, + functional=False): + paddle.disable_static() + logit = paddle.to_tensor(logit_np) + label = paddle.to_tensor(label_np) + weight = None + pos_weight = None + if weight_np is not None: + weight = paddle.to_tensor(weight_np) + if pos_weight_np is not None: + pos_weight = paddle.to_tensor(pos_weight_np) + if functional: + dy_res = call_bce_functional(logit, label, weight, reduction, + pos_weight) + else: + dy_res = call_bce_layer(logit, label, weight, reduction, pos_weight) + dy_result = dy_res.numpy() + paddle.enable_static() + return dy_result + + +def calc_bce_with_logits_loss(logit_np, + label_np, + reduction='mean', + weight_np=None, + pos_weight=None): + expected = np.maximum( + logit_np, + 0) - logit_np * label_np + np.log(1 + np.exp(-np.abs(logit_np))) + if pos_weight is not None: + expected = expected * ((pos_weight - 1) * label_np + 1) + if weight_np is not None: + expected = weight_np * expected + + if reduction == 'mean': + expected = np.mean(expected) + elif reduction == 'sum': + expected = np.sum(expected) + else: + expected = expected + + return expected + + +class TestBCEWithLogitsLoss(unittest.TestCase): + def test_BCEWithLogitsLoss(self): + logit_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64) + label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64) + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + reductions = ['sum', 'mean', 'none'] + for place in places: + for reduction in reductions: + static_result = test_static( + place, logit_np, label_np, reduction=reduction) + dy_result = test_dygraph( + place, logit_np, label_np, reduction=reduction) + expected = 
calc_bce_with_logits_loss(logit_np, label_np, + reduction) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static( + place, + logit_np, + label_np, + reduction=reduction, + functional=True) + dy_functional = test_dygraph( + place, + logit_np, + label_np, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_BCEWithLogitsLoss_weight(self): + logit_np = np.random.uniform( + 0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64) + label_np = np.random.randint( + 0, 2, size=(2, 3, 4, 10)).astype(np.float64) + weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64) + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + for reduction in ['sum', 'mean', 'none']: + static_result = test_static( + place, + logit_np, + label_np, + weight_np=weight_np, + reduction=reduction) + dy_result = test_dygraph( + place, + logit_np, + label_np, + weight_np=weight_np, + reduction=reduction) + expected = calc_bce_with_logits_loss( + logit_np, label_np, reduction, weight_np=weight_np) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static( + place, + logit_np, + label_np, + weight_np=weight_np, + reduction=reduction, + functional=True) + dy_functional = test_dygraph( + place, + logit_np, + label_np, + weight_np=weight_np, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_BCEWithLogitsLoss_pos_weight(self): + logit_np = np.random.uniform( + 0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64) + label_np = np.random.randint( + 0, 2, size=(2, 3, 4, 10)).astype(np.float64) + pos_weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64) + weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64) + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + reduction = "mean" + static_result = test_static(place, logit_np, label_np, weight_np, + reduction, pos_weight_np) + dy_result = test_dygraph(place, logit_np, label_np, weight_np, + reduction, pos_weight_np) + expected = calc_bce_with_logits_loss(logit_np, label_np, reduction, + weight_np, pos_weight_np) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static( + place, + logit_np, + label_np, + weight_np, + reduction, + pos_weight_np, + functional=True) + dy_functional = test_dygraph( + place, + logit_np, + label_np, + weight_np, + reduction, + pos_weight_np, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_BCEWithLogitsLoss_error(self): + paddle.disable_static() + self.assertRaises( + ValueError, + paddle.nn.BCEWithLogitsLoss, + reduction="unsupport reduction") + logit = paddle.to_tensor([[0.1, 0.3]], dtype='float32') + label = 
paddle.to_tensor([[0.0, 1.0]], dtype='float32') + self.assertRaises( + ValueError, + paddle.nn.functional.binary_cross_entropy_with_logits, + logit=logit, + label=label, + reduction="unsupport reduction") + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py new file mode 100644 index 0000000000000000000000000000000000000000..12a29de80426639ab3a9d2b879bb88a461ba2ab4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -0,0 +1,76 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle +from op_test import OpTest +import numpy as np + + +def output_hist(out): + hist, _ = np.histogram(out, bins=2) + hist = hist.astype("float32") + hist /= float(out.size) + prob = 0.5 * np.ones((2)) + return hist, prob + + +class TestBernoulliOp(OpTest): + def setUp(self): + self.op_type = "bernoulli" + self.inputs = {"X": np.random.uniform(size=(1000, 784))} + self.init_attrs() + self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} + + def init_attrs(self): + self.attrs = {} + self.output_hist = output_hist + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + hist, prob = self.output_hist(np.array(outs[0])) + self.assertTrue( + np.allclose( + hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + + +class TestBernoulliApi(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + x = paddle.rand([1024, 1024]) + out = paddle.bernoulli(x) + paddle.enable_static() + hist, prob = output_hist(out.numpy()) + self.assertTrue( + np.allclose( + hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + + def test_static(self): + x = paddle.rand([1024, 1024]) + out = paddle.bernoulli(x) + exe = paddle.static.Executor(paddle.CPUPlace()) + out = exe.run(paddle.static.default_main_program(), + fetch_list=[out.name]) + hist, prob = output_hist(out[0]) + self.assertTrue( + np.allclose( + hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..01daea32167d28edbb46d6854872976aed79494e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py @@ -0,0 +1,504 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
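The numpy reference calc_bce_with_logits_loss above relies on the numerically stable rewrite of sigmoid cross-entropy. The following standalone sketch (numpy only, all names local to the snippet) checks that the stable form agrees with the direct definition on moderate logits; the pos_weight/weight factors are applied afterwards exactly as in the helper.

import numpy as np

def naive_bce_with_logits(logit, label):
    # Direct definition: -[y * log(sigmoid(x)) + (1 - y) * log(1 - sigmoid(x))].
    p = 1.0 / (1.0 + np.exp(-logit))
    return -(label * np.log(p) + (1.0 - label) * np.log(1.0 - p))

def stable_bce_with_logits(logit, label):
    # Rewrite used by the test reference: max(x, 0) - x * y + log(1 + exp(-|x|)),
    # which never exponentiates a large positive number.
    return np.maximum(logit, 0) - logit * label + np.log1p(np.exp(-np.abs(logit)))

x = np.random.uniform(-6.0, 6.0, size=(20, 30))
y = np.random.randint(0, 2, size=(20, 30)).astype(np.float64)
assert np.allclose(naive_bce_with_logits(x, y), stable_bce_with_logits(x, y))
# pos_weight rescales only the positive term: loss *= (pos_weight - 1) * y + 1,
# and an elementwise weight multiplies the result before the mean/sum reduction.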
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle +from paddle.fluid import Program, program_guard +from paddle.nn.functional import interpolate + + +def cubic_1(x, a): + return ((a + 2) * x - (a + 3)) * x * x + 1 + + +def cubic_2(x, a): + return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a + + +def cubic_interp1d(x0, x1, x2, x3, t): + param = [0, 0, 0, 0] + a = -0.75 + x_1 = t + x_2 = 1.0 - t + param[0] = cubic_2(x_1 + 1.0, a) + param[1] = cubic_1(x_1, a) + param[2] = cubic_1(x_2, a) + param[3] = cubic_2(x_2 + 1.0, a) + return x0 * param[0] + x1 * param[1] + x2 * param[2] + x3 * param[3] + + +def value_bound(input, w, h, x, y): + access_x = int(max(min(x, w - 1), 0)) + access_y = int(max(min(y, h - 1), 0)) + return input[:, :, access_y, access_x] + + +def bicubic_interp_np(input, + out_h, + out_w, + out_size=None, + actual_shape=None, + align_corners=True, + data_layout='kNCHW'): + """trilinear interpolation implement in shape [N, C, H, W]""" + if data_layout == "NHWC": + input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] + if actual_shape is not None: + out_h = actual_shape[0] + out_w = actual_shape[1] + batch_size, channel, in_h, in_w = input.shape + + ratio_h = ratio_w = 0.0 + if out_h > 1: + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + + if out_w > 1: + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((batch_size, channel, out_h, out_w)) + + for k in range(out_h): + if (align_corners): + h = ratio_h * k + else: + h = ratio_h * (k + 0.5) - 0.5 + input_y = np.floor(h) + y_t = h - input_y + for l in range(out_w): + if (align_corners): + w = ratio_w * l + else: + w = ratio_w * (l + 0.5) - 0.5 + input_x = np.floor(w) + x_t = w - input_x + for i in range(batch_size): + for j in range(channel): + coefficients = [0, 0, 0, 0] + for ii in range(4): + access_x_0 = int(max(min(input_x - 1, in_w - 1), 0)) + access_x_1 = int(max(min(input_x + 0, in_w - 1), 0)) + access_x_2 = int(max(min(input_x + 1, in_w - 1), 0)) + access_x_3 = int(max(min(input_x + 2, in_w - 1), 0)) + access_y = int(max(min(input_y - 1 + ii, in_h - 1), 0)) + + coefficients[ii] = cubic_interp1d( + input[i, j, access_y, access_x_0], + input[i, j, access_y, access_x_1], + input[i, j, access_y, access_x_2], + input[i, j, access_y, access_x_3], x_t) + out[i, j, k, l] = cubic_interp1d( + coefficients[0], coefficients[1], coefficients[2], + coefficients[3], y_t) + if data_layout == "NHWC": + out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC + return out.astype(input.dtype) + + +class TestBicubicInterpOp(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.op_type = "bicubic_interp_v2" + input_np = np.random.random(self.input_shape).astype("float64") + + if self.data_layout == "NCHW": + in_h = self.input_shape[2] 
+ in_w = self.input_shape[3] + else: + in_h = self.input_shape[1] + in_w = self.input_shape[2] + + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = bicubic_interp_np(input_np, out_h, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [2, 3, 5, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + + +class TestBicubicInterpCase1(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.align_corners = True + + +class TestBicubicInterpCase2(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [3, 3, 9, 6] + self.out_h = 10 + self.out_w = 8 + self.scale = 0. + self.align_corners = True + + +class TestBicubicInterpCase3(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = False + + +class TestBicubicInterpCase4(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True + + +class TestBicubicInterpCase5(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [3, 3, 9, 6] + self.out_h = 11 + self.out_w = 11 + self.scale = 0. + self.out_size = np.array([6, 4]).astype("int32") + self.align_corners = False + + +class TestBicubicInterpCase6(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0 + self.out_size = np.array([64, 32]).astype("int32") + self.align_corners = False + + +class TestBicubicInterpSame(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = 0. 
+ self.align_corners = True + + +class TestBicubicInterpScale(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = [1., 1.] + self.align_corners = True + + +class TestBicubicInterpDataLayout(TestBicubicInterpOp): + def init_test_case(self): + self.interp_method = 'bicubic' + self.input_shape = [2, 5, 5, 3] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.data_layout = "NHWC" + + +class TestBicubicInterpOpAPI(unittest.TestCase): + def test_case(self): + np.random.seed(200) + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + with fluid.program_guard(prog, startup_prog): + + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data( + name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data( + name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data( + name="scale_tensor", shape=[1], dtype="float32") + + out1 = interpolate( + x, size=[12, 12], mode='bicubic', align_corners=False) + out2 = interpolate( + x, size=[12, dim], mode='bicubic', align_corners=False) + out3 = interpolate( + x, size=shape_tensor, mode='bicubic', align_corners=False) + out4 = interpolate( + x, size=[12, 12], mode='bicubic', align_corners=False) + out5 = interpolate( + x, + scale_factor=scale_tensor, + mode='bicubic', + align_corners=False) + out6 = interpolate( + x, scale_factor=2.0, mode='bicubic', align_corners=False) + out7 = interpolate( + x, scale_factor=[2.0, 2.0], mode='bicubic', align_corners=False) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run( + fluid.default_main_program(), + feed={ + "x": x_data, + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5, out6, out7], + return_numpy=True) + + expect_res = bicubic_interp_np( + x_data, out_h=12, out_w=12, align_corners=False) + for res in results: + self.assertTrue(np.allclose(res, expect_res)) + + with fluid.dygraph.guard(): + x = fluid.dygraph.to_variable(x_data) + interp = interpolate( + x, size=[12, 12], mode='bicubic', align_corners=False) + dy_result = interp.numpy() + expect = bicubic_interp_np( + x_data, out_h=12, out_w=12, align_corners=False) + self.assertTrue(np.allclose(dy_result, expect)) + + +class TestBicubicOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of interpoalte must be Variable. 
+ x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, interpolate, x1) + + def test_mode_type(): + # mode must be "BILINEAR" "TRILINEAR" "NEAREST" "BICUBIC" + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + + out = interpolate( + x, size=[12, 12], mode='UNKONWN', align_corners=False) + + def test_input_shape(): + x = fluid.data(name="x", shape=[2], dtype="float32") + out = interpolate( + x, size=[12, 12], mode='BICUBIC', align_corners=False) + + def test_align_corcers(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + interpolate(x, size=[12, 12], mode='BICUBIC', align_corners=3) + + def test_out_shape(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, size=[12], mode='bicubic', align_corners=False) + + def test_attr_data_format(): + # for 5-D input, data_format only can be NCDHW or NDHWC + input = fluid.data( + name="input", shape=[2, 3, 6, 9, 4], dtype="float32") + out = interpolate( + input, + size=[4, 8, 4, 5], + mode='trilinear', + data_format='NHWC') + + def test_actual_shape(): + # the actual_shape must be Variable. + x = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + out = interpolate( + x, size=[12, 12], mode='BICUBIC', align_corners=False) + + def test_scale_value(): + # the scale must be greater than zero. + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='BICUBIC', + align_corners=False, + scale_factor=-2.0) + + def test_attr_5D_input(): + # for 5-D input, data_format only can be NCDHW or NDHWC + input = fluid.data( + name="input", shape=[2, 3, 6, 9, 4], dtype="float32") + out = interpolate( + input, + size=[4, 8, 4, 5], + mode='trilinear', + data_format='NDHWC') + + def test_scale_type(): + # the scale must be greater than zero. 
+ x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + scale = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + out = interpolate( + x, + size=None, + mode='bicubic', + align_corners=False, + scale_factor=scale) + + def test_align_mode(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='nearest', + align_corners=False, + align_mode=2, + scale_factor=1.0) + + def test_outshape_and_scale(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='bicubic', + align_corners=False, + scale_factor=None) + + def test_align_corners_and_nearest(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='nearest', + align_corners=True, + scale_factor=None) + + def test_scale_shape(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='nearest', + align_corners=False, + scale_factor=[1, 2, 2]) + + def test_scale_value(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='trilinear', + align_corners=False, + scale_factor=[1, 2, 2]) + + self.assertRaises(ValueError, test_mode_type) + self.assertRaises(ValueError, test_input_shape) + self.assertRaises(TypeError, test_align_corcers) + self.assertRaises(ValueError, test_attr_data_format) + self.assertRaises(TypeError, test_actual_shape) + self.assertRaises(ValueError, test_scale_value) + self.assertRaises(ValueError, test_out_shape) + self.assertRaises(ValueError, test_attr_5D_input) + self.assertRaises(TypeError, test_scale_type) + self.assertRaises(ValueError, test_align_mode) + self.assertRaises(ValueError, test_outshape_and_scale) + self.assertRaises(ValueError, test_align_corners_and_nearest) + self.assertRaises(ValueError, test_scale_shape) + self.assertRaises(ValueError, test_scale_value) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_api.py b/python/paddle/fluid/tests/unittests/test_bilinear_api.py new file mode 100644 index 0000000000000000000000000000000000000000..24eae4797de85f371ed62e78c85b160f698ee9eb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bilinear_api.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
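The cubic_interp1d helper defined earlier in test_bicubic_interp_v2_op.py evaluates Keys' cubic convolution kernel with a = -0.75. The sketch below re-declares the same weights locally (under a hypothetical name) and checks the two properties the bicubic reference depends on: the four weights form a partition of unity, and t = 0 / t = 1 return the two inner samples exactly.

import numpy as np

def keys_cubic_interp1d(x0, x1, x2, x3, t, a=-0.75):
    # Weights for the four neighbouring samples; same polynomials as cubic_1/cubic_2 above.
    w0 = ((a * (t + 1) - 5 * a) * (t + 1) + 8 * a) * (t + 1) - 4 * a
    w1 = ((a + 2) * t - (a + 3)) * t * t + 1
    w2 = ((a + 2) * (1 - t) - (a + 3)) * (1 - t) * (1 - t) + 1
    w3 = ((a * (2 - t) - 5 * a) * (2 - t) + 8 * a) * (2 - t) - 4 * a
    assert np.allclose(w0 + w1 + w2 + w3, 1.0)  # partition of unity for any t
    return x0 * w0 + x1 * w1 + x2 * w2 + x3 * w3

samples = np.array([1.0, 2.0, 3.0, 4.0])
assert np.isclose(keys_cubic_interp1d(*samples, t=0.0), samples[1])  # interpolant hits x1
assert np.isclose(keys_cubic_interp1d(*samples, t=1.0), samples[2])  # interpolant hits x2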
+ +from __future__ import print_function + +import unittest +from op_test import OpTest + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np + + +class TestBilinearAPI(unittest.TestCase): + def test_api(self): + with fluid.program_guard(fluid.default_startup_program(), + fluid.default_main_program()): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + + data1 = fluid.data(name='X1', shape=[5, 5], dtype='float32') + data2 = fluid.data(name='X2', shape=[5, 4], dtype='float32') + + layer1 = np.random.random((5, 5)).astype('float32') + layer2 = np.random.random((5, 4)).astype('float32') + + bilinear = paddle.nn.Bilinear( + in1_features=5, in2_features=4, out_features=1000) + ret = bilinear(data1, data2) + + exe.run(fluid.default_startup_program()) + ret_fetch = exe.run(feed={'X1': layer1, + 'X2': layer2}, + fetch_list=[ret.name]) + self.assertEqual(ret_fetch[0].shape, (5, 1000)) + + +class TestBilinearAPIDygraph(unittest.TestCase): + def test_api(self): + paddle.disable_static() + layer1 = np.random.random((5, 5)).astype('float32') + layer2 = np.random.random((5, 4)).astype('float32') + bilinear = paddle.nn.Bilinear( + in1_features=5, in2_features=4, out_features=1000) + ret = bilinear(paddle.to_tensor(layer1), paddle.to_tensor(layer2)) + self.assertEqual(ret.shape, [5, 1000]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py new file mode 100755 index 0000000000000000000000000000000000000000..d139a53c7e2ccc68964457f3142b4ed890d339f2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -0,0 +1,620 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
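For the shape assertions in TestBilinearAPI above, it may help to see the bilinear form written out. The numpy sketch below assumes the usual layout of one (in1_features, in2_features) weight matrix per output feature plus a per-output bias; it is an illustration of the arithmetic, not the paddle.nn.Bilinear implementation.

import numpy as np

n, in1, in2, out = 5, 5, 4, 1000  # batch and feature sizes from the test above
x1 = np.random.random((n, in1)).astype('float32')
x2 = np.random.random((n, in2)).astype('float32')
w = np.random.random((out, in1, in2)).astype('float32')  # assumed weight layout
b = np.random.random((out,)).astype('float32')

# ret[s, k] = x1[s] @ w[k] @ x2[s] + b[k]
ret = np.einsum('si,kij,sj->sk', x1, w, x2) + b
assert ret.shape == (n, out)  # matches the (5, 1000) shape checked by the test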
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.nn.functional import interpolate +import paddle + + +def bilinear_interp_np(input, + out_h, + out_w, + out_size=None, + actual_shape=None, + align_corners=True, + align_mode=0, + data_layout='NCHW'): + """bilinear interpolation implement in shape [N, C, H, W]""" + if data_layout == "NHWC": + input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] + if actual_shape is not None: + out_h = actual_shape[0] + out_w = actual_shape[1] + batch_size, channel, in_h, in_w = input.shape + + ratio_h = ratio_w = 0.0 + if out_h > 1: + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + if out_w > 1: + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((batch_size, channel, out_h, out_w)) + + for i in range(out_h): + if (align_mode == 0 and not align_corners): + h = int(ratio_h * (i + 0.5) - 0.5) + else: + h = int(ratio_h * i) + + h = max(0, h) + hid = 1 if h < in_h - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0) + h1lambda = idx_src_h - h + else: + h1lambda = ratio_h * i - h + h2lambda = 1.0 - h1lambda + for j in range(out_w): + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (j + 0.5) - 0.5) + else: + w = int(ratio_w * j) + w = max(0, w) + wid = 1 if w < in_w - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) + w1lambda = idx_src_w - w + else: + w1lambda = ratio_w * j - w + w2lambda = 1.0 - w1lambda + + out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + + w1lambda*input[:, :, h, w+wid]) + \ + h1lambda*(w2lambda*input[:, :, h+hid, w] + + w1lambda*input[:, :, h+hid, w+wid]) + + if data_layout == "NHWC": + out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC + + return out.astype(input.dtype) + + +class TestBilinearInterpOp(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.op_type = "bilinear_interp_v2" + input_np = np.random.random(self.input_shape).astype("float64") + + if self.data_layout == "NCHW": + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_h = self.input_shape[1] + in_w = self.input_shape[2] + + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode, self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode, + 'data_layout': self.data_layout + } 
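The ratio_h / ratio_w arithmetic in bilinear_interp_np above (and in the bicubic reference earlier) boils down to how a destination index is mapped back to a source coordinate. A small sketch of the three mappings exercised by these tests, mirroring the reference code only (out_size > 1 assumed):

import numpy as np

def src_coord(dst, in_size, out_size, align_corners, align_mode=0):
    if align_corners:
        # Grid endpoints coincide: src spans exactly [0, in_size - 1].
        return dst * (in_size - 1.0) / (out_size - 1.0)
    if align_mode == 0:
        # Half-pixel (pixel-centre) mapping; negative values are clamped later.
        return (dst + 0.5) * in_size / out_size - 0.5
    # align_mode == 1 drops the half-pixel shift.
    return dst * in_size / out_size

idx = np.arange(10)
print(src_coord(idx, 5, 10, align_corners=True))                 # 0.0 ... 4.0
print(src_coord(idx, 5, 10, align_corners=False, align_mode=0))  # -0.25 ... 4.25
print(src_coord(idx, 5, 10, align_corners=False, align_mode=1))  # 0.0 ... 4.5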
+ if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase1(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase2(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase3(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase4(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase5(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase6(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([65, 33]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpSame(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpActualShape(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpDataLayout(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 5, 5, 3] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. 
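The scale handling repeated in these setUp methods normalises a positive scalar or a one-element list into a [scale_h, scale_w] pair before it is stored in attrs; the rule, factored out as a tiny sketch:

def normalize_scale(scale):
    # Mirrors the test logic: positive scalar -> [s, s]; [s] -> [s, s]; [sh, sw] unchanged.
    if isinstance(scale, (float, int)) and scale > 0.:
        scale = [float(scale)]
    if isinstance(scale, list) and len(scale) == 1:
        scale = [scale[0], scale[0]]
    return scale

assert normalize_scale(2.0) == [2.0, 2.0]
assert normalize_scale([1.5]) == [1.5, 1.5]
assert normalize_scale([1.5, 0.5]) == [1.5, 0.5]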
+ self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + self.data_layout = "NHWC" + + +class TestBilinearInterpOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "bilinear_interp_v2" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode + } + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(place=core.CPUPlace(), atol=1) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 3, 9, 6] + self.out_h = 10 + self.out_w = 9 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.scale = 0. + self.out_size = np.array([6, 15]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpOtherMethod1(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + self.align_mode = 1 + + +class TestBilinearInterpWithMethod2(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpWithMethod3(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = True + self.align_mode = 0 + + +class TestBilinearInterpScale1(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 2. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale2(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1. 
+ self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale3(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1.5 + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale4(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = [1.5, 0.5] + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpZero(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 0.2 + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpOp_attr_tensor(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "bilinear_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + self.attrs = { + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + } + + input_np = np.random.random(self.input_shape).astype("float64") + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float32") + elif self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, + self.actual_shape, self.align_corners) + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 5] + self.out_h = 3 + self.out_w = 3 + self.scale = 0. + self.out_size = [3, 3] + self.align_corners = True + + +# out_size is a 1-D tensor +class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. 
+ self.out_size = [8, 12] + self.align_corners = True + + +# scale is a 1-D tensor +class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + self.shape_by_1Dtensor = True + + +# scale is a 1-D tensor +class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 2.0 + self.out_size = None + self.align_corners = True + self.scale_by_1Dtensor = True + + +class TestBilinearInterpOpAPI(unittest.TestCase): + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data( + name="scale_tensor", shape=[1], dtype="float32") + + out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12]) + out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim]) + out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor) + out4 = fluid.layers.resize_bilinear( + x, out_shape=[4, 4], actual_shape=actual_size) + out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor) + + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = bilinear_interp_np( + x_data, out_h=12, out_w=12, align_corners=True) + for res in results: + self.assertTrue(np.allclose(res, expect_res)) + + +class TestUpsampleBilinear2dInterpOpAPI2_0(unittest.TestCase): + def test_case(self): + + # dygraph + x_data = np.random.random((1, 3, 6, 6)).astype("float32") + upsample = paddle.nn.UpsamplingBilinear2d(scale_factor=[2, 2]) + with fluid.dygraph.guard(): + x = fluid.dygraph.to_variable(x_data) + interp = upsample(x) + expect = bilinear_interp_np( + x_data, out_h=12, out_w=12, align_corners=True) + self.assertTrue(np.allclose(interp.numpy(), expect)) + + +class TestBilinearInterpOpAPI_dy(unittest.TestCase): + def test_case(self): + import paddle + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + input_x = paddle.to_tensor(input_data) + expect_res = bilinear_interp_np( + input_data, out_h=12, out_w=12, align_corners=False) + out = interpolate( + x=input_x, size=[12, 12], mode="bilinear", align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py 
b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index 5cc8e2ba15d260b988ee66a5711aed42ca04c10b..cc2b1165ec304a63671b48d4702142ea38c9a2c1 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -65,7 +65,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None): """Bipartite Matching algorithm for batch input. Arg: distance (numpy.array) : The distance of two entries with shape [M, N]. - lod (list of int): The offsets of each input in this batch. + lod (list of int): The length of each input in this batch. """ n = len(lod) m = distance.shape[1] @@ -73,6 +73,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None): match_dist = np.zeros((n, m), dtype=np.float32) cur_offset = 0 for i in range(n): + if lod[i] == 0: continue bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :], match_indices[i, :], match_dist[i, :]) if match_type == 'per_prediction': @@ -155,5 +156,22 @@ class TestBipartiteMatchOpWithPerPredictionType(OpTest): self.check_output() +class TestBipartiteMatchOpWithEmptyLoD(OpTest): + def setUp(self): + self.op_type = 'bipartite_match' + lod = [[5, 6, 0, 12]] + dist = np.random.random((23, 217)).astype('float32') + match_indices, match_dist = batch_bipartite_match(dist, lod[0]) + + self.inputs = {'DistMat': (dist, lod)} + self.outputs = { + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_dist, + } + + def test_check_output(self): + self.check_output() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index 671efd8c721550256c181059528bead43deb0718..43d485a0a6d24be6e8db32f16fe96a70bb229858 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
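The hunk above changes the lod argument from batch offsets to per-sample lengths and skips empty samples, which is what the new [5, 6, 0, 12] test case exercises. A toy sketch of how the row blocks of DistMat are recovered from those lengths:

import numpy as np

lod = [5, 6, 0, 12]                       # per-sample row counts; they sum to 23
dist = np.random.random((sum(lod), 217)).astype('float32')

cur_offset = 0
for i, length in enumerate(lod):
    if length == 0:
        continue                          # empty sample: the pre-filled defaults are kept
    rows = dist[cur_offset:cur_offset + length, :]
    print(i, rows.shape)                  # (5, 217), (6, 217), (12, 217)
    cur_offset += length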
+import paddle import paddle.fluid as fluid from paddle.fluid.framework import Parameter import numpy as np @@ -44,10 +45,10 @@ class InplaceTestBase(unittest.TestCase): def build_program_and_scope(self): self.place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup_program = fluid.Program() main_program = fluid.Program() - startup_program.random_seed = 1 - main_program.random_seed = 1 scope = fluid.Scope() with fluid.program_guard(main_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py index f3e6c079eedc8effc948a44e08a5dcdcae8d3081..ab08a0aacbf08768ffff43974ee9a7c7dd4a7288 100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py +++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py @@ -100,5 +100,45 @@ class TestDygraph(unittest.TestCase): out = paddle.cholesky(x, upper=False) +class TestCholeskySingularAPI(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place, with_out=False): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[4, 4], dtype="float64") + result = paddle.cholesky(input) + + input_np = np.zeros([4, 4]).astype("float64") + + exe = fluid.Executor(place) + try: + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + except fluid.core.EnforceNotMet as ex: + print("The mat is singular") + pass + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_np = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [[10, 11, 12], [13, 14, 15], + [16, 17, 18]]]).astype("float64") + input = fluid.dygraph.to_variable(input_np) + try: + result = paddle.cholesky(input) + except fluid.core.EnforceNotMet as ex: + print("The mat is singular") + pass + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_clamp.py b/python/paddle/fluid/tests/unittests/test_clamp.py deleted file mode 100644 index d8d7fe01f8de8686724ea8ebc00491269f2cc0bd..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_clamp.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
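TestCholeskySingularAPI above only checks that the failure on a singular input is surfaced and reported. For comparison, the same zero matrix is rejected by dense Cholesky in numpy, since the factorisation needs a positive-definite input; this is a numpy analogue, not the Paddle code path.

import numpy as np

try:
    np.linalg.cholesky(np.zeros((4, 4), dtype=np.float64))
except np.linalg.LinAlgError as ex:
    print("The mat is singular:", ex)  # not positive definite, so the factorisation fails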
- -from __future__ import print_function -import paddle.tensor as tensor -import paddle.fluid as fluid -import numpy as np -import unittest - - -class TestClampAPI(unittest.TestCase): - def test_dygraph_clamp(self): - in1 = np.array([[1.2, 3.5], [4.5, 6.4]]).astype('float32') - with fluid.dygraph.guard(): - x1 = fluid.dygraph.to_variable(in1) - out1 = tensor.clamp(x1, min=3.5, max=5.0) - out2 = tensor.clamp(x1, min=2.5) - self.assertTrue( - np.allclose( - out1.numpy(), in1.clip( - min=3.5, max=5.0))) - self.assertTrue(np.allclose(out2.numpy(), in1.clip(min=2.5))) - - def test_clamp(self): - data_shape = [1, 9, 9, 4] - data = np.random.random(data_shape).astype('float32') - images = fluid.data(name='image', shape=data_shape, dtype='float32') - min = fluid.data(name='min', shape=[1], dtype='float32') - max = fluid.data(name='max', shape=[1], dtype='float32') - - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - exe = fluid.Executor(place) - - out_1 = tensor.clamp(images, min=min, max=max) - out_2 = tensor.clamp(images, min=0.2, max=0.9) - out_3 = tensor.clamp(images, min=0.3) - out_4 = tensor.clamp(images, max=0.7) - out_5 = tensor.clamp(images, min=min) - out_6 = tensor.clamp(images, max=max) - - res1, res2, res3, res4, res5, res6 = exe.run( - fluid.default_main_program(), - feed={ - "image": data, - "min": np.array([0.2]).astype('float32'), - "max": np.array([0.8]).astype('float32') - }, - fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6]) - - self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) - self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9))) - self.assertTrue(np.allclose(res3, data.clip(min=0.3))) - self.assertTrue(np.allclose(res4, data.clip(max=0.7))) - self.assertTrue(np.allclose(res5, data.clip(min=0.2))) - self.assertTrue(np.allclose(res6, data.clip(max=0.8))) - - -class TestClampError(unittest.TestCase): - def test_errors(self): - x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16") - x2 = fluid.layers.data(name='x2', shape=[1], dtype="int8") - self.assertRaises(TypeError, tensor.clamp, x=x1, min=0.2, max=0.8) - self.assertRaises(TypeError, tensor.clamp, x=x2, min=0.2, max=0.8) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index 33bbd4c8830d689bed513b9ce4084c3d00a923a8..74c01e1424885051faf3e263e6ca26c1269a838e 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard from op_test import OpTest @@ -92,6 +93,13 @@ class TestCase4(TestClipOp): self.inputs['Min'] = np.array([0.3]).astype('float32') +class TestCase5(TestClipOp): + def initTestCase(self): + self.shape = (4, 8, 16) + self.max = 0.5 + self.min = 0.5 + + class TestClipOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -109,5 +117,69 @@ class TestClipOpError(unittest.TestCase): self.assertRaises(TypeError, test_dtype) +class TestClipAPI(unittest.TestCase): + def test_clip(self): + paddle.enable_static() + data_shape = [1, 9, 9, 4] + data = np.random.random(data_shape).astype('float32') + images = fluid.data(name='image', shape=data_shape, dtype='float32') + min = fluid.data(name='min', shape=[1], dtype='float32') + max = fluid.data(name='max', 
shape=[1], dtype='float32') + + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + + out_1 = paddle.clip(images, min=min, max=max) + out_2 = paddle.clip(images, min=0.2, max=0.9) + out_3 = paddle.clip(images, min=0.3) + out_4 = paddle.clip(images, max=0.7) + out_5 = paddle.clip(images, min=min) + out_6 = paddle.clip(images, max=max) + out_7 = paddle.clip(images, max=-1.) + out_8 = paddle.clip(images) + + res1, res2, res3, res4, res5, res6, res7, res8 = exe.run( + fluid.default_main_program(), + feed={ + "image": data, + "min": np.array([0.2]).astype('float32'), + "max": np.array([0.8]).astype('float32') + }, + fetch_list=[ + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8 + ]) + + self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) + self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9))) + self.assertTrue(np.allclose(res3, data.clip(min=0.3))) + self.assertTrue(np.allclose(res4, data.clip(max=0.7))) + self.assertTrue(np.allclose(res5, data.clip(min=0.2))) + self.assertTrue(np.allclose(res6, data.clip(max=0.8))) + self.assertTrue(np.allclose(res7, data.clip(max=-1))) + self.assertTrue(np.allclose(res8, data)) + + def test_clip_dygraph(self): + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + paddle.disable_static(place) + data_shape = [1, 9, 9, 4] + data = np.random.random(data_shape).astype('float32') + images = paddle.to_variable(data, dtype='float32') + + out_1 = paddle.clip(images, min=0.2, max=0.8) + out_2 = paddle.clip(images, min=0.2, max=0.9) + + self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) + self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) + + def test_errors(self): + paddle.enable_static() + x1 = fluid.data(name='x1', shape=[1], dtype="int16") + x2 = fluid.data(name='x2', shape=[1], dtype="int8") + self.assertRaises(TypeError, paddle.clip, x=x1, min=0.2, max=0.8) + self.assertRaises(TypeError, paddle.clip, x=x2, min=0.2, max=0.8) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py new file mode 100644 index 0000000000000000000000000000000000000000..71777df4651ea26c7cf5dfc7231018288c2670e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
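The expectations in TestClipAPI above all reduce to numpy's clip semantics, including the one-sided and degenerate bounds; a short numpy-only recap of those edge cases (illustration, not the Paddle kernel):

import numpy as np

data = np.random.random((1, 9, 9, 4)).astype('float32')   # values in [0, 1)

assert np.allclose(data.clip(0.2, 0.8), np.minimum(np.maximum(data, 0.2), 0.8))
assert np.allclose(data.clip(min=0.3), np.maximum(data, 0.3))   # lower bound only
assert np.allclose(data.clip(max=0.7), np.minimum(data, 0.7))   # upper bound only
assert np.all(data.clip(max=-1.0) == -1.0)   # max below the data collapses everything
assert np.all(data.clip(0.5, 0.5) == 0.5)    # min == max, the case added in TestCase5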
+ +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_api_base import TestDistBase + + +class TestCollectiveAllgatherAPI(TestDistBase): + def _setup_config(self): + pass + + def test_allgather_nccl(self): + self.check_with_place("collective_allgather_api.py", "allgather", + "nccl") + + def test_allgather_gloo(self): + self.check_with_place("collective_allgather_api.py", "allgather", + "gloo", "3") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py new file mode 100644 index 0000000000000000000000000000000000000000..24dd7cacff6adc56eb059a7bec016a1d3e322825 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_api_base import TestDistBase + + +class TestCollectiveAllreduceAPI(TestDistBase): + def _setup_config(self): + pass + + def test_allreduce_nccl(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "nccl") + + def test_allreduce_gloo(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "gloo", "2") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py new file mode 100644 index 0000000000000000000000000000000000000000..437b8b7befae470ab438cabc40817996cda5c938 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -0,0 +1,284 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import numpy as np +import unittest +import time +import argparse +import os +import six +import sys +import subprocess +import traceback +import functools +import pickle +from contextlib import closing +from six import string_types +import paddle.fluid as fluid +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core + + +class TestCollectiveAPIRunnerBase(object): + def get_model(self, train_prog, startup_prog, rank): + raise NotImplementedError( + "get model should be implemented by child class.") + + def wait_server_ready(self, endpoints): + assert not isinstance(endpoints, string_types) + while True: + all_ok = True + not_ready_endpoints = [] + for ep in endpoints: + ip_port = ep.split(":") + with closing( + socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + not_ready_endpoints.append(ep) + if not all_ok: + sys.stderr.write("server not ready, wait 3 sec to retry...\n") + sys.stderr.write("not ready endpoints:" + str( + not_ready_endpoints) + "\n") + sys.stderr.flush() + time.sleep(3) + else: + break + + def initCommunicator(self, program, rank, nranks, wait_port, + current_endpoint, endpoints): + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + if rank == 0 and wait_port: + self.wait_server_ready(other_endpoints) + block = program.global_block() + nccl_id_var = block.create_var( + name=nameGen.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints + }) + + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': self.global_ring_id + }) + + def run_trainer(self, args): + train_prog = fluid.Program() + startup_prog = fluid.Program() + endpoints = args["endpoints"].split(",") + rank = args["trainerid"] + current_endpoint = args["currentendpoint"] + nranks = 2 + result = self.get_model(train_prog, startup_prog, rank) + if args['backend'] == 'nccl': + self.initCommunicator(startup_prog, rank, nranks, True, + current_endpoint, endpoints) + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace( + device_id) #if args.use_gpu else fluid.CPUPlace() + else: + strategy = fluid.core.GlooParallelStrategy() + strategy.rank = rank + strategy.rank_num = nranks + strategy.prefix = "" + strategy.iface = "lo" + strategy.init_seconds = 999999 + strategy.run_seconds = 999999 + strategy.path = "/tmp/tmp%d" % args['path_id'] + gloo = fluid.core.GlooParallelContext(strategy) + gloo.init() + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + np.random.seed(os.getpid()) + indata = np.random.random((10, 1000)) + fetch_list = [] + for elem in result: + fetch_list.append(elem.name) + out = exe.run(train_prog, + feed={'tindata': indata}, + fetch_list=fetch_list) + if six.PY2: + print(pickle.dumps(out)) + else: + sys.stdout.buffer.write(pickle.dumps(out)) + + +def runtime_main(test_class, col_type): + args = {} + model = test_class() + args["deviceid"] = os.getenv("FLAGS_selected_gpus") + args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID")) + args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM")) + args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS') + 
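wait_server_ready above polls every peer endpoint with a short TCP connect before the NCCL communicator is initialised. A minimal standalone version of that probe (the address below is just a placeholder):

import socket
from contextlib import closing

def endpoint_ready(endpoint, timeout=2):
    """Return True if host:port is currently accepting TCP connections."""
    host, port = endpoint.split(":")
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(timeout)
        return sock.connect_ex((host, int(port))) == 0

print(endpoint_ready("127.0.0.1:6170"))  # False unless something is listening on 6170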
args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") + args["col_type"] = col_type + args["backend"] = os.getenv("BACKEND") + args["path_id"] = int(os.getenv("PATH_ID")) + model.run_trainer(args) + + +import paddle.compat as cpt +import socket +from contextlib import closing + + +class TestDistBase(unittest.TestCase): + def setUp(self): + self._port_set = set() + self._trainers = 2 + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._python_interp = sys.executable + + def _find_free_port(self): + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port + + def _run_cluster(self, model_file, envs): + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + #print("w0_ep:",w0_ep," w1_ep:",w1_ep) + env0 = { + "FLAGS_selected_gpus": "0", + "PADDLE_TRAINER_ID": "0", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w0_ep + } + + env1 = { + "FLAGS_selected_gpus": "1", + "PADDLE_TRAINER_ID": "1", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w1_ep + } + #update environment + env0.update(envs) + env1.update(envs) + tr_cmd = "%s %s" + tr0_cmd = tr_cmd % (self._python_interp, model_file) + tr1_cmd = tr_cmd % (self._python_interp, model_file) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") + #print(tr0_cmd) + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + + tr1_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + return pickle.loads(tr0_out), pickle.loads( + tr1_out), tr0_proc.pid, tr1_proc.pid + + def check_with_place(self, + model_file, + col_type, + backend="nccl", + path_id="0", + check_error_log=False, + need_envs={}): + required_envs = { + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_eager_delete_tensor_gb": "0.0", + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), + "GLOG_v": "0", + "NCCL_P2P_DISABLE": "1", + "BACKEND": backend, + "PATH_ID": path_id + } + required_envs.update(need_envs) + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, + required_envs) + np.random.seed(pid0) + input1 = np.random.random((10, 1000)) + np.random.seed(pid1) + input2 = np.random.random((10, 1000)) + if col_type == "allgather": + need_result = np.vstack((input1, input2)) + tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) + tr_out1 = np.vstack((tr1_out[0], tr1_out[1])) + self.assertTrue(np.allclose(tr_out0, need_result)) + self.assertTrue(np.allclose(tr_out1, need_result)) + elif col_type == "broadcast": + need_result = input2 + self.assertTrue(np.allclose(tr0_out, need_result)) + self.assertTrue(np.allclose(tr1_out, need_result)) + elif col_type == 
"reduce": + need_result = input1 + input2 + self.assertTrue(np.allclose(tr0_out, need_result)) + elif col_type == "scatter": + need_result = input2 + need_result1 = need_result[0:need_result.shape[0] // 2] + need_result2 = need_result[need_result.shape[0] // 2:] + self.assertTrue(np.allclose(tr0_out, need_result1)) + self.assertTrue(np.allclose(tr1_out, need_result2)) + elif col_type == "allreduce": + need_result = input1 + input2 + self.assertTrue( + np.allclose( + tr0_out, need_result, rtol=1e-05, atol=1e-05)) + self.assertTrue( + np.allclose( + tr1_out, need_result, rtol=1e-05, atol=1e-05)) + else: + pass diff --git a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py new file mode 100644 index 0000000000000000000000000000000000000000..ebf86f6ae14f1ecbdb3711378c84a3c1ce4967fb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_api_base import TestDistBase + + +class TestCollectiveBarrierAPI(TestDistBase): + def _setup_config(self): + pass + + def test_barrier_nccl(self): + self.check_with_place("collective_barrier_api.py", "barrier", "nccl") + + def test_barrier_gloo(self): + self.check_with_place("collective_barrier_api.py", "barrier", "gloo", + "5") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py index 3f3a5642abc242e994844d0aac1b79cbf664e4d4..512b2967e02fd01e67f416c2fd9222ae8589d8d8 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_base.py @@ -241,6 +241,15 @@ class TestDistBase(unittest.TestCase): need_result = input2 self.assertTrue(np.allclose(tr0_out, need_result)) self.assertTrue(np.allclose(tr1_out, need_result)) + elif col_type == "reduce": + need_result = input1 + input2 + self.assertTrue(np.allclose(tr1_out, need_result)) + elif col_type == "scatter": + need_result = input2 + need_result1 = need_result[0:need_result.shape[0] // 2] + need_result2 = need_result[need_result.shape[0] // 2:] + self.assertTrue(np.allclose(tr0_out, need_result1)) + self.assertTrue(np.allclose(tr1_out, need_result2)) elif col_type == "allreduce": need_result = input1 + input2 self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py new file mode 100644 index 0000000000000000000000000000000000000000..b1cf4f1ac4c822ad578f5ee0e0268324de5e5e25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_api_base import TestDistBase + + +class TestCollectiveBroadcastAPI(TestDistBase): + def _setup_config(self): + pass + + def test_broadcast_nccl(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "nccl") + + def test_broadcast_gloo(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "gloo", "0") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce.py b/python/paddle/fluid/tests/unittests/test_collective_reduce.py new file mode 100644 index 0000000000000000000000000000000000000000..36837d6a227febd02e6ef1e2aeb905de19ca8acc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_reduce.py @@ -0,0 +1,34 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_base import TestDistBase + + +class TestCReduceOp(TestDistBase): + def _setup_config(self): + pass + + def test_reduce(self): + self.check_with_place("collective_reduce_op.py", "reduce") + + def test_reduce_calc_stream(self): + self.check_with_place("collective_reduce_op_calc_stream.py", "reduce") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py new file mode 100644 index 0000000000000000000000000000000000000000..bf3975f3fc1c6959ffbb28a51543ebfef00c52e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
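# Illustrative sketch (standalone; helper name chosen for illustration): the shared
# base class forwards the requested backend and gloo path id to each trainer
# process through the BACKEND and PATH_ID environment variables, roughly like this.
import os
import subprocess
import sys

def launch_trainer(script, backend="gloo", path_id="1"):
    # A distinct PATH_ID keeps concurrent gloo tests from sharing a rendezvous
    # directory (the /tmp/tmp<path_id> path used by the runner earlier in this patch).
    env = dict(os.environ, BACKEND=backend, PATH_ID=path_id)
    return subprocess.Popen([sys.executable, script], env=env)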
+ +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_api_base import TestDistBase + + +class TestCollectiveReduceAPI(TestDistBase): + def _setup_config(self): + pass + + def test_reduce_nccl(self): + self.check_with_place("collective_reduce_api.py", "reduce", "nccl") + + def test_reduce_gloo(self): + self.check_with_place("collective_reduce_api.py", "reduce", "gloo", "1") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_scatter.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe3ce73359559c0f9b4e0e3990032ce693aab8a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_scatter.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_base import TestDistBase + + +class TestCScatterOp(TestDistBase): + def _setup_config(self): + pass + + def test_scatter(self): + self.check_with_place("collective_scatter_op.py", "scatter") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cae842b396111f004b7ce52ce3f40c20ebe57263 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
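# Illustrative sketch (NumPy only): the scatter checks added in both test bases
# earlier in this patch slice the root tensor (the seeded input of rank 1) in half,
# expecting rank 0 to receive the first half and rank 1 the second half.
import numpy as np

root = np.arange(20.0).reshape(10, 2)   # stand-in for rank 1's seeded input
half = root.shape[0] // 2
rank0_expected, rank1_expected = root[:half], root[half:]

assert rank0_expected.shape == rank1_expected.shape == (5, 2)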
+ +from __future__ import print_function +import unittest +import numpy as np + +from test_collective_api_base import TestDistBase + + +class TestCollectiveScatterAPI(TestDistBase): + def _setup_config(self): + pass + + def test_scatter_gloo(self): + self.check_with_place("collective_scatter_api.py", "scatter", "gloo", + "4") + + def test_scatter_nccl(self): + self.check_with_place("collective_scatter_api.py", "scatter", "nccl") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index b277047500fb8082bd7a93998f424902bd563be6..30207340a27db0c1d00ab982cbac716e4b639c7e 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -25,7 +25,7 @@ import numpy import paddle import paddle.fluid as fluid -import paddle.fluid.incubate.fleet.base.role_maker as role_maker +import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet as fleet diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py index e6db5c4d8c1357c4b6bde87101a29502c14c37d3..c0044d9d620796057cce0e3a51b2dec2878a0e17 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py @@ -21,7 +21,7 @@ import os import paddle import paddle.fluid as fluid -import paddle.fluid.incubate.fleet.base.role_maker as role_maker +import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet as fleet diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 99d0c77fce50ffdfae8a3ec11fce42ea7942c5e6..cfad50409802d4f3d35c9da3b22597c681da91b1 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -93,8 +93,9 @@ def create_paddle_case(op_type, callback): def test_broadcast_api_1(self): with program_guard(Program(), Program()): - x = paddle.nn.data(name='x', shape=[1, 2, 1, 3], dtype='int32') - y = paddle.nn.data(name='y', shape=[1, 2, 3], dtype='int32') + x = paddle.static.data( + name='x', shape=[1, 2, 1, 3], dtype='int32') + y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32') op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py b/python/paddle/fluid/tests/unittests/test_compiled_program.py index 8430f39578047facdeae6535db134152a7038717..751fed2e56126909d36670b56afbeee9bf2a694f 100644 --- a/python/paddle/fluid/tests/unittests/test_compiled_program.py +++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid import core from test_imperative_base import new_program_scope @@ -29,8 +30,8 @@ class TestCompiledProgram(unittest.TestCase): self.label = np.random.randint( low=0, high=10, size=[16, 1], dtype=np.int64) with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) place = fluid.CUDAPlace(0) if 
core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) @@ -46,8 +47,8 @@ class TestCompiledProgram(unittest.TestCase): def test_compiled_program_base(self): with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) @@ -64,8 +65,8 @@ class TestCompiledProgram(unittest.TestCase): def test_compiled_program_with_data_parallel(self): with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..da527b26bf0608da5a648d92b492ff27cf2802f0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
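# Illustrative sketch (NumPy only; stride 1, no padding, single group): a naive
# reference for the 1-D cross-correlation that the test case below exercises via
# F.conv1d and nn.Conv1d with shared weights.
import numpy as np

def conv1d_ref(x, w, b=None):
    # x: (N, C_in, L), w: (C_out, C_in, K), b: (C_out,) or None
    n, c_in, l = x.shape
    c_out, _, k = w.shape
    out = np.zeros((n, c_out, l - k + 1), dtype=x.dtype)
    for i in range(l - k + 1):
        window = x[:, :, i:i + k]                              # (N, C_in, K)
        out[:, :, i] = np.tensordot(window, w, axes=([1, 2], [1, 2]))
    if b is not None:
        out = out + b.reshape(1, -1, 1)
    return out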
+ +import numpy as np +import paddle +from paddle import fluid, nn +import paddle.fluid.dygraph as dg +import paddle.nn.functional as F +import paddle.fluid.initializer as I +import unittest + + +class Conv1dTestCase(unittest.TestCase): + def __init__(self, + methodName='runTest', + batch_size=4, + spartial_shape=(16, ), + num_channels=6, + num_filters=8, + filter_size=3, + padding=0, + padding_mode="zeros", + stride=1, + dilation=1, + groups=1, + no_bias=False, + dtype="float32", + data_format="NCL"): + super(Conv1dTestCase, self).__init__(methodName) + self.batch_size = batch_size + self.num_channels = num_channels + self.num_filters = num_filters + self.spartial_shape = spartial_shape + self.filter_size = filter_size + self.data_format = data_format + self.channel_last = (self.data_format == "NHWC") + + self.padding = padding + self.padding_mode = padding_mode + self.stride = stride + self.dilation = dilation + self.groups = groups + self.no_bias = no_bias + self.dtype = dtype + + def setUp(self): + input_shape = (self.batch_size, self.num_channels + ) + self.spartial_shape if not self.channel_last else ( + self.batch_size, ) + self.spartial_shape + ( + self.num_channels, ) + self.input = np.random.randn(*input_shape).astype(self.dtype) + + if isinstance(self.filter_size, int): + filter_size = [self.filter_size] + else: + filter_size = self.filter_size + self.weight_shape = weight_shape = (self.num_filters, self.num_channels + // self.groups) + tuple(filter_size) + self.weight = np.random.uniform( + -1, 1, size=weight_shape).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform( + -1, 1, size=(self.num_filters, )).astype(self.dtype) + else: + self.bias = None + + def functional(self, place): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + input_shape = (-1, self.num_channels, + -1) if not self.channel_last else ( + -1, -1, self.num_channels) + x_var = fluid.data("input", input_shape, dtype=self.dtype) + w_var = fluid.data( + "weight", self.weight_shape, dtype=self.dtype) + b_var = fluid.data( + "bias", (self.num_filters, ), dtype=self.dtype) + y_var = F.conv1d( + x_var, + w_var, + b_var if not self.no_bias else None, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + feed_dict = {"input": self.input, "weight": self.weight} + if self.bias is not None: + feed_dict["bias"] = self.bias + exe = fluid.Executor(place) + exe.run(start) + y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def paddle_nn_layer(self): + x_var = paddle.to_tensor(self.input) + conv = nn.Conv1d( + self.num_channels, + self.num_filters, + self.filter_size, + padding=self.padding, + padding_mode=self.padding_mode, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + conv.weight.set_value(self.weight) + if not self.no_bias: + conv.bias.set_value(self.bias) + y_var = conv(x_var) + y_np = y_var.numpy() + return y_np + + def _test_equivalence(self, place): + result1 = self.functional(place) + with dg.guard(place): + result2 = self.paddle_nn_layer() + np.testing.assert_array_almost_equal(result1, result2) + + def runTest(self): + place = fluid.CPUPlace() + self._test_equivalence(place) + + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + self._test_equivalence(place) + + +class Conv1dErrorTestCase(Conv1dTestCase): + def runTest(self): + place = 
fluid.CPUPlace() + with dg.guard(place): + with self.assertRaises(ValueError): + self.paddle_nn_layer() + + +def add_cases(suite): + suite.addTest(Conv1dTestCase(methodName='runTest')) + suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2)) + suite.addTest(Conv1dTestCase(methodName='runTest', stride=2, dilation=(1))) + suite.addTest( + Conv1dTestCase( + methodName='runTest', padding="same", no_bias=True)) + suite.addTest( + Conv1dTestCase( + methodName='runTest', filter_size=3, padding='valid')) + suite.addTest( + Conv1dTestCase( + methodName='runTest', padding=2, data_format='NLC')) + suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1])) + suite.addTest(Conv1dTestCase(methodName='runTest', padding=2)) + suite.addTest(Conv1dTestCase(methodName='runTest')) + suite.addTest( + Conv1dTestCase( + methodName='runTest', groups=2, padding="valid")) + suite.addTest( + Conv1dTestCase( + methodName='runTest', + num_filters=6, + num_channels=3, + groups=3, + padding="valid", + data_format='NLC')) + + +def add_error_cases(suite): + suite.addTest( + Conv1dErrorTestCase( + methodName='runTest', padding_mode="reflect", padding="valid")) + suite.addTest( + Conv1dErrorTestCase( + methodName='runTest', data_format="VALID")) + suite.addTest( + Conv1dErrorTestCase( + methodName='runTest', padding_mode="VALID")) + suite.addTest( + Conv1dErrorTestCase( + methodName='runTest', num_channels=5, groups=2)) + suite.addTest( + Conv1dErrorTestCase( + methodName='runTest', num_filters=8, num_channels=15, groups=3)) + suite.addTest( + Conv1dErrorTestCase( + methodName='runTest', padding=[1, 2, 3, 4, 5])) + + +def load_tests(loader, standard_tests, pattern): + suite = unittest.TestSuite() + add_cases(suite) + add_error_cases(suite) + return suite + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..73227dd3610376d85fcfc70bb2653dfd927427fd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py @@ -0,0 +1,229 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
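# Illustrative sketch (assumes the usual transposed-convolution length formula):
# several cases below pick output_size or output_padding, whose consistency with
# the minimal output length is typically checked as follows.
def conv_transpose1d_out_len(l_in, kernel, stride=1, padding=0,
                             output_padding=0, dilation=1):
    return ((l_in - 1) * stride - 2 * padding + dilation * (kernel - 1)
            + output_padding + 1)

# For spartial_shape=16, filter_size=3, stride=2, dilation=2 the minimum is 35;
# the output_size=[36] case below is typically still accepted because the requested
# size may exceed the minimum by up to stride - 1.
assert conv_transpose1d_out_len(16, 3, stride=2, dilation=2) == 35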
+ +import numpy as np +import paddle +from paddle import fluid, nn +import paddle.fluid.dygraph as dg +import paddle.nn.functional as F +import paddle.fluid.initializer as I +import unittest + + +class ConvTranspose1dTestCase(unittest.TestCase): + def __init__(self, + methodName='runTest', + batch_size=4, + spartial_shape=16, + in_channels=6, + out_channels=8, + filter_size=3, + output_size=None, + padding=0, + output_padding=0, + stride=1, + dilation=1, + groups=1, + no_bias=False, + data_format="NCL", + dtype="float32"): + super(ConvTranspose1dTestCase, self).__init__(methodName) + self.batch_size = batch_size + self.in_channels = in_channels + self.out_channels = out_channels + self.spartial_shape = spartial_shape + self.filter_size = filter_size + self.output_size = output_size + + self.padding = padding + self.output_padding = output_padding + self.stride = stride + self.dilation = dilation + self.groups = groups + self.no_bias = no_bias + self.data_format = data_format + self.dtype = dtype + + def setUp(self): + + self.channel_last = False if self.data_format == "NCL" else True + input_shape = (self.batch_size, self.in_channels, + self.spartial_shape) if not self.channel_last else ( + self.batch_size, + self.spartial_shape, + self.in_channels, ) + self.input = np.random.randn(*input_shape).astype(self.dtype) + + if isinstance(self.filter_size, int): + filter_size = [self.filter_size] + else: + filter_size = self.filter_size + self.weight_shape = weight_shape = (self.in_channels, self.out_channels + // self.groups) + tuple(filter_size) + self.weight = np.random.uniform( + -1, 1, size=weight_shape).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform( + -1, 1, size=(self.out_channels, )).astype(self.dtype) + else: + self.bias = None + + def functional(self, place): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + input_shape = (-1, self.in_channels, + -1) if not self.channel_last else ( + -1, -1, self.in_channels) + x_var = fluid.data("input", input_shape, dtype=self.dtype) + w_var = fluid.data( + "weight", self.weight_shape, dtype=self.dtype) + b_var = fluid.data( + "bias", (self.out_channels, ), dtype=self.dtype) + y_var = F.conv_transpose1d( + x_var, + w_var, + None if self.no_bias else b_var, + output_size=self.output_size, + padding=self.padding, + output_padding=self.output_padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + feed_dict = {"input": self.input, "weight": self.weight} + if self.bias is not None: + feed_dict["bias"] = self.bias + exe = fluid.Executor(place) + exe.run(start) + y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def paddle_nn_layer(self): + x_var = paddle.to_tensor(self.input) + conv = nn.ConvTranspose1d( + self.in_channels, + self.out_channels, + self.filter_size, + padding=self.padding, + output_padding=self.output_padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + conv.weight.set_value(self.weight) + if not self.no_bias: + conv.bias.set_value(self.bias) + y_var = conv(x_var, output_size=self.output_size) + y_np = y_var.numpy() + return y_np + + def _test_equivalence(self, place): + result1 = self.functional(place) + with dg.guard(place): + result2 = self.paddle_nn_layer() + np.testing.assert_array_almost_equal(result1, result2) + + def runTest(self): + place = fluid.CPUPlace() + self._test_equivalence(place) 
+ + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + self._test_equivalence(place) + + +class ConvTranspose1dErrorTestCase(ConvTranspose1dTestCase): + def runTest(self): + place = fluid.CPUPlace() + with dg.guard(place): + with self.assertRaises(ValueError): + self.paddle_nn_layer() + + +def add_cases(suite): + suite.addTest(ConvTranspose1dTestCase(methodName='runTest')) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', stride=[2], no_bias=True, dilation=2)) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', + filter_size=(3), + output_size=[36], + stride=[2], + dilation=2)) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', stride=2, dilation=(2))) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', padding="valid")) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', padding='valid')) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', filter_size=1, padding=3)) + suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[2])) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', data_format="NLC")) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', groups=2, padding="valid")) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', + out_channels=6, + in_channels=3, + groups=3, + padding="valid")) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', + data_format="NLC", + spartial_shape=16, + output_size=18)) + suite.addTest( + ConvTranspose1dTestCase( + methodName='runTest', data_format="NLC", stride=3, + output_padding=2)) + + +def add_error_cases(suite): + suite.addTest( + ConvTranspose1dErrorTestCase( + methodName='runTest', data_format="not_valid")) + suite.addTest( + ConvTranspose1dErrorTestCase( + methodName='runTest', in_channels=5, groups=2)) + suite.addTest( + ConvTranspose1dErrorTestCase( + methodName='runTest', stride=2, output_padding=3)) + suite.addTest( + ConvTranspose1dErrorTestCase( + methodName='runTest', output_size="not_valid")) + + +def load_tests(loader, standard_tests, pattern): + suite = unittest.TestSuite() + add_cases(suite) + add_error_cases(suite) + return suite + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py index 64653ce2e7b8630030094b4004ecb17d56d3ff43..6bfe2aca530ddea6b49f12ad34dd9672e2a99ab5 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py @@ -20,6 +20,10 @@ import paddle.fluid.initializer as I import unittest +def _reverse_repeat_list(t, n): + return list(x for x in reversed(t) for _ in range(n)) + + class Conv2DTestCase(unittest.TestCase): def __init__(self, methodName='runTest', @@ -29,12 +33,11 @@ class Conv2DTestCase(unittest.TestCase): num_filters=8, filter_size=3, padding=0, + padding_mode='zeros', stride=1, dilation=1, groups=1, - act=None, no_bias=False, - use_cudnn=True, data_format="NCHW", dtype="float32"): super(Conv2DTestCase, self).__init__(methodName) @@ -45,12 +48,16 @@ class Conv2DTestCase(unittest.TestCase): self.filter_size = filter_size self.padding = padding + if padding_mode in {'reflect', 'replicate', 'circular'}: + _paired_padding = fluid.layers.utils.convert_to_list(padding, 2, + 'padding') + self._reversed_padding_repeated_twice = _reverse_repeat_list( + _paired_padding, 2) + self.padding_mode = padding_mode self.stride = stride 
self.dilation = dilation self.groups = groups - self.act = act self.no_bias = no_bias - self.use_cudnn = use_cudnn self.data_format = data_format self.dtype = dtype @@ -91,19 +98,27 @@ class Conv2DTestCase(unittest.TestCase): bias_attr = False else: bias_attr = I.NumpyArrayInitializer(self.bias) + if self.padding_mode != 'zeros': + x_var = F.pad(x_var, + self._reversed_padding_repeated_twice, + mode=self.padding_mode, + data_format=self.data_format) + padding = 0 + else: + padding = self.padding + y_var = fluid.layers.conv2d( x_var, self.num_filters, self.filter_size, - padding=self.padding, + padding=padding, stride=self.stride, dilation=self.dilation, groups=self.groups, param_attr=weight_attr, bias_attr=bias_attr, - use_cudnn=self.use_cudnn, - act=self.act, data_format=self.data_format) + feed_dict = {"input": self.input} exe = fluid.Executor(place) exe.run(start) @@ -122,16 +137,24 @@ class Conv2DTestCase(unittest.TestCase): "weight", self.weight_shape, dtype=self.dtype) b_var = fluid.data( "bias", (self.num_filters, ), dtype=self.dtype) + + if self.padding_mode != 'zeros': + x_var = F.pad(x_var, + self._reversed_padding_repeated_twice, + mode=self.padding_mode, + data_format=self.data_format) + padding = 0 + else: + padding = self.padding + y_var = F.conv2d( x_var, w_var, b_var if not self.no_bias else None, - padding=self.padding, + padding=padding, stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, data_format=self.data_format) feed_dict = {"input": self.input, "weight": self.weight} if self.bias is not None: @@ -143,18 +166,16 @@ class Conv2DTestCase(unittest.TestCase): def paddle_nn_layer(self): x_var = dg.to_variable(self.input) - conv = nn.Conv2D( + conv = nn.Conv2d( self.num_channels, self.num_filters, self.filter_size, padding=self.padding, + padding_mode=self.padding_mode, stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, - data_format=self.data_format, - dtype=self.dtype) + data_format=self.data_format) conv.weight.set_value(self.weight) if not self.no_bias: conv.bias.set_value(self.bias) @@ -198,7 +219,7 @@ def add_cases(suite): methodName='runTest', stride=2, dilation=(2, 1))) suite.addTest( Conv2DTestCase( - methodName='runTest', padding="same", no_bias=True, act="sigmoid")) + methodName='runTest', padding="same", no_bias=True)) suite.addTest( Conv2DTestCase( methodName='runTest', filter_size=(3, 3), padding='valid')) @@ -222,15 +243,28 @@ def add_cases(suite): num_filters=6, num_channels=3, groups=3, - use_cudnn=False, - act="sigmoid", padding="valid")) + suite.addTest( + Conv2DTestCase( + methodName='runTest', + filter_size=(3, 3), + padding=1, + padding_mode='reflect')) + suite.addTest( + Conv2DTestCase( + methodName='runTest', + filter_size=(3, 3), + padding=1, + padding_mode='replicate')) + suite.addTest( + Conv2DTestCase( + methodName='runTest', + filter_size=(3, 3), + padding=1, + padding_mode='circular')) def add_error_cases(suite): - suite.addTest( - Conv2DErrorTestCase( - methodName='runTest', use_cudnn="not_valid")) suite.addTest( Conv2DErrorTestCase( methodName='runTest', num_channels=5, groups=2)) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py index 989836d5993af5620a7b5fbd86c07b028e419fc4..ba450b345b8a309f5d7ff1e7a5c149809f55f46c 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py +++ 
b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py @@ -29,13 +29,12 @@ class Conv2DTransposeTestCase(unittest.TestCase): num_filters=8, filter_size=3, output_size=None, + output_padding=0, padding=0, stride=1, dilation=1, groups=1, - act=None, no_bias=False, - use_cudnn=True, data_format="NCHW", dtype="float32"): super(Conv2DTransposeTestCase, self).__init__(methodName) @@ -45,14 +44,13 @@ class Conv2DTransposeTestCase(unittest.TestCase): self.spartial_shape = spartial_shape self.filter_size = filter_size self.output_size = output_size + self.output_padding = output_padding self.padding = padding self.stride = stride self.dilation = dilation self.groups = groups - self.act = act self.no_bias = no_bias - self.use_cudnn = use_cudnn self.data_format = data_format self.dtype = dtype @@ -93,6 +91,7 @@ class Conv2DTransposeTestCase(unittest.TestCase): bias_attr = False else: bias_attr = I.NumpyArrayInitializer(self.bias) + y_var = fluid.layers.conv2d_transpose( x_var, self.num_filters, @@ -104,8 +103,6 @@ class Conv2DTransposeTestCase(unittest.TestCase): groups=self.groups, param_attr=weight_attr, bias_attr=bias_attr, - use_cudnn=self.use_cudnn, - act=self.act, data_format=self.data_format) feed_dict = {"input": self.input} exe = fluid.Executor(place) @@ -125,17 +122,22 @@ class Conv2DTransposeTestCase(unittest.TestCase): "weight", self.weight_shape, dtype=self.dtype) b_var = fluid.data( "bias", (self.num_filters, ), dtype=self.dtype) - y_var = F.conv2d_transpose( + + if self.output_padding != 0: + output_size = None + else: + output_size = self.output_size + + y_var = F.conv_transpose2d( x_var, w_var, None if self.no_bias else b_var, - output_size=self.output_size, + output_size=output_size, padding=self.padding, + output_padding=self.output_padding, stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, data_format=self.data_format) feed_dict = {"input": self.input, "weight": self.weight} if self.bias is not None: @@ -147,32 +149,38 @@ class Conv2DTransposeTestCase(unittest.TestCase): def paddle_nn_layer(self): x_var = dg.to_variable(self.input) - conv = nn.Conv2DTranspose( + + if self.output_padding != 0: + output_size = None + else: + output_size = self.output_size + + conv = nn.ConvTranspose2d( self.num_channels, self.num_filters, self.filter_size, - output_size=self.output_size, padding=self.padding, + output_padding=self.output_padding, stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, - data_format=self.data_format, - dtype=self.dtype) + data_format=self.data_format) conv.weight.set_value(self.weight) if not self.no_bias: conv.bias.set_value(self.bias) - y_var = conv(x_var) + y_var = conv(x_var, output_size) y_np = y_var.numpy() return y_np def _test_equivalence(self, place): place = fluid.CPUPlace() + result1 = self.fluid_layer(place) result2 = self.functional(place) + with dg.guard(place): result3 = self.paddle_nn_layer() + np.testing.assert_array_almost_equal(result1, result2) np.testing.assert_array_almost_equal(result2, result3) @@ -194,7 +202,7 @@ class Conv2DTransposeErrorTestCase(Conv2DTransposeTestCase): def add_cases(suite): - suite.addTest(Conv2DTransposeTestCase(methodName='runTest', act="relu")) + suite.addTest(Conv2DTransposeTestCase(methodName='runTest')) suite.addTest( Conv2DTransposeTestCase( methodName='runTest', stride=[1, 2], no_bias=True, dilation=2)) @@ -211,9 +219,6 @@ def add_cases(suite): suite.addTest( Conv2DTransposeTestCase( 
methodName='runTest', padding="valid")) - suite.addTest( - Conv2DTransposeTestCase( - methodName='runTest', padding='valid')) suite.addTest( Conv2DTransposeTestCase( methodName='runTest', filter_size=1, padding=(2, 3))) @@ -240,15 +245,22 @@ def add_cases(suite): num_filters=6, num_channels=3, groups=3, - use_cudnn=False, - act="sigmoid", padding="valid")) + suite.addTest( + Conv2DTransposeTestCase( + methodName='runTest', + num_filters=6, + num_channels=3, + spartial_shape=(7, 7), + filter_size=[5, 5], + groups=1, + padding=2, + stride=2, + output_size=[14, 14], + output_padding=[1, 1], )) def add_error_cases(suite): - suite.addTest( - Conv2DTransposeErrorTestCase( - methodName='runTest', use_cudnn="not_valid")) suite.addTest( Conv2DTransposeErrorTestCase( methodName='runTest', num_channels=5, groups=2)) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index f4418150e8a69d795ff544073b6ba6dd7431e44b..913db51da500b6c324abfab61744dfc1947bf7a5 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -77,8 +77,13 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): output_size = attrs['output_size'] out_h = output_size[0] + pad_h_0 + pad_h_1 out_w = output_size[1] + pad_w_0 + pad_w_1 - - out = np.zeros((in_n, out_c, out_h, out_w), dtype=input_.dtype) + out_pad_h = 0 + out_pad_w = 0 + if 'output_padding' in attrs: + out_pad_h = attrs['output_padding'][0] + out_pad_w = attrs['output_padding'][1] + out = np.zeros( + (in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), dtype=input_.dtype) for n in range(in_n): for i in range(in_h): @@ -99,7 +104,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2: dilations[1]] += tmp_out - out = out[:, :, pad_h_0:out_h - pad_h_1, pad_w_0:out_w - pad_w_1] + out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h, pad_w_0:out_w - pad_w_1 + + out_pad_w] if attrs['data_format'] == 'NHWC': out = np.transpose(out, [0, 2, 3, 1]) return out @@ -114,6 +120,7 @@ class TestConv2dTransposeOp(OpTest): self.use_cudnn = False self.use_mkldnn = False self.output_size = None + self.output_padding = [] self.data_format = "NCHW" self.pad = [0, 0] self.padding_algorithm = "EXPLICIT" @@ -138,6 +145,9 @@ class TestConv2dTransposeOp(OpTest): if self.output_size is not None: self.attrs['output_size'] = self.output_size + if len(self.output_padding) > 0: + self.attrs['output_padding'] = self.output_padding + output = conv2dtranspose_forward_naive(input_, filter_, self.attrs).astype(self.dtype) @@ -290,6 +300,18 @@ class TestWithEvenUpsample(TestConv2dTransposeOp): self.filter_size = [f_c, 6, 5, 5] +class TestWithEvenUpsampleOutputPadding(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_padding = [1, 1] + self.input_size = [2, 3, 7, 7] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 5, 5] + + class Test_NHWC(TestConv2dTransposeOp): def init_test_case(self): self.pad = [0, 0] @@ -375,6 +397,19 @@ class TestWithEvenUpsample_NHWC(TestConv2dTransposeOp): self.data_format = 'NHWC' +class TestWithEvenUpsample_NHWC_output_padding(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_padding = [1, 1] + self.input_size = [2, 7, 7, 3] 
# NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 5, 5] + self.data_format = 'NHWC' + + # ------------ test_cudnn ------------ @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index cf582c6210b76c6546de6d09d9219dbf4005bb17..56355a1c95e0396d0dec53cae02c3a99bf874013 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -32,9 +32,7 @@ class Conv3DTestCase(unittest.TestCase): stride=1, dilation=1, groups=1, - act=None, no_bias=False, - use_cudnn=True, data_format="NCDHW", dtype="float32"): super(Conv3DTestCase, self).__init__(methodName) @@ -48,9 +46,7 @@ class Conv3DTestCase(unittest.TestCase): self.stride = stride self.dilation = dilation self.groups = groups - self.act = act self.no_bias = no_bias - self.use_cudnn = use_cudnn self.data_format = data_format self.dtype = dtype @@ -101,8 +97,6 @@ class Conv3DTestCase(unittest.TestCase): groups=self.groups, param_attr=weight_attr, bias_attr=bias_attr, - use_cudnn=self.use_cudnn, - act=self.act, data_format=self.data_format) feed_dict = {"input": self.input} exe = fluid.Executor(place) @@ -130,8 +124,6 @@ class Conv3DTestCase(unittest.TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, data_format=self.data_format) feed_dict = {"input": self.input, "weight": self.weight} if self.bias is not None: @@ -143,7 +135,7 @@ class Conv3DTestCase(unittest.TestCase): def paddle_nn_layer(self): x_var = dg.to_variable(self.input) - conv = nn.Conv3D( + conv = nn.Conv3d( self.num_channels, self.num_filters, self.filter_size, @@ -151,10 +143,7 @@ class Conv3DTestCase(unittest.TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, - data_format=self.data_format, - dtype=self.dtype) + data_format=self.data_format) conv.weight.set_value(self.weight) if not self.no_bias: conv.bias.set_value(self.bias) @@ -225,15 +214,10 @@ def add_cases(suite): num_filters=6, num_channels=3, groups=3, - use_cudnn=False, - act="sigmoid", padding="valid")) def add_error_cases(suite): - suite.addTest( - Conv3DErrorTestCase( - methodName='runTest', use_cudnn="not_valid")) suite.addTest( Conv3DErrorTestCase( methodName='runTest', num_channels=5, groups=2)) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py index acaf33467dbfc1c580ab3a36f08d0c2a26d7c239..e30f0cd3ecd0b872efa53c85e0666e4a6fb00a88 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py @@ -33,9 +33,7 @@ class Conv3DTransposeTestCase(unittest.TestCase): stride=1, dilation=1, groups=1, - act=None, no_bias=False, - use_cudnn=True, data_format="NCDHW", dtype="float32"): super(Conv3DTransposeTestCase, self).__init__(methodName) @@ -50,9 +48,7 @@ class Conv3DTransposeTestCase(unittest.TestCase): self.stride = stride self.dilation = dilation self.groups = groups - self.act = act self.no_bias = no_bias - self.use_cudnn = use_cudnn self.data_format = data_format self.dtype = dtype @@ -104,8 +100,6 @@ class Conv3DTransposeTestCase(unittest.TestCase): groups=self.groups, param_attr=weight_attr, bias_attr=bias_attr, - use_cudnn=self.use_cudnn, - act=self.act, 
data_format=self.data_format) feed_dict = {"input": self.input} exe = fluid.Executor(place) @@ -125,7 +119,7 @@ class Conv3DTransposeTestCase(unittest.TestCase): "weight", self.weight_shape, dtype=self.dtype) b_var = fluid.data( "bias", (self.num_filters, ), dtype=self.dtype) - y_var = F.conv3d_transpose( + y_var = F.conv_transpose3d( x_var, w_var, None if self.no_bias else b_var, @@ -134,8 +128,6 @@ class Conv3DTransposeTestCase(unittest.TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, data_format=self.data_format) feed_dict = {"input": self.input, "weight": self.weight} if self.bias is not None: @@ -147,23 +139,19 @@ class Conv3DTransposeTestCase(unittest.TestCase): def paddle_nn_layer(self): x_var = dg.to_variable(self.input) - conv = nn.Conv3DTranspose( + conv = nn.ConvTranspose3d( self.num_channels, self.num_filters, self.filter_size, - output_size=self.output_size, padding=self.padding, stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - use_cudnn=self.use_cudnn, - data_format=self.data_format, - dtype=self.dtype) + data_format=self.data_format) conv.weight.set_value(self.weight) if not self.no_bias: conv.bias.set_value(self.bias) - y_var = conv(x_var) + y_var = conv(x_var, self.output_size) y_np = y_var.numpy() return y_np @@ -194,7 +182,7 @@ class Conv3DTransposeErrorTestCase(Conv3DTransposeTestCase): def add_cases(suite): - suite.addTest(Conv3DTransposeTestCase(methodName='runTest', act="tanh")) + suite.addTest(Conv3DTransposeTestCase(methodName='runTest')) suite.addTest( Conv3DTransposeTestCase( methodName='runTest', stride=[1, 2, 1], dilation=2, no_bias=True)) @@ -240,15 +228,10 @@ def add_cases(suite): num_filters=6, num_channels=3, groups=3, - use_cudnn=False, - act="sigmoid", padding="valid")) def add_error_cases(suite): - suite.addTest( - Conv3DTransposeErrorTestCase( - methodName='runTest', use_cudnn="not_valid")) suite.addTest( Conv3DTransposeErrorTestCase( methodName='runTest', num_channels=5, groups=2)) diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py new file mode 100644 index 0000000000000000000000000000000000000000..1e25613fa63da440f71f23841095f153e61735e9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py @@ -0,0 +1,140 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
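# Illustrative sketch (NumPy only): a tiny worked example of the eps-clipped cosine
# similarity that the reference helper below (_get_numpy_out) reproduces per axis.
import numpy as np

x1 = np.array([[1.0, 0.0], [1.0, 1.0]])
x2 = np.array([[0.0, 1.0], [2.0, 2.0]])
eps = 1e-8

num = np.sum(x1 * x2, axis=1)
den = np.sqrt(np.clip(np.sum(x1 * x1, axis=1) * np.sum(x2 * x2, axis=1),
                      eps * eps, None))
cos = num / den
assert np.allclose(cos, [0.0, 1.0])   # orthogonal rows -> 0, parallel rows -> 1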
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.fluid.core as core + +from paddle.fluid import Program, program_guard, Executor, default_main_program + + +class TestCosineSimilarityAPI(unittest.TestCase): + def setUp(self): + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def _get_numpy_out(self, x1, x2, axis=1, eps=1e-8): + w12 = np.sum(x1 * x2, axis=axis) + w1 = np.sum(x1 * x1, axis=axis) + w2 = np.sum(x2 * x2, axis=axis) + n12 = np.sqrt(np.clip(w1 * w2, eps * eps, None)) + cos_sim = w12 / n12 + return cos_sim + + def check_static_result(self, place): + paddle.enable_static() + + with program_guard(Program(), Program()): + shape = [10, 15] + axis = 1 + eps = 1e-8 + np.random.seed(0) + np_x1 = np.random.rand(*shape).astype(np.float32) + np_x2 = np.random.rand(*shape).astype(np.float32) + + x1 = paddle.data(name="x1", shape=shape) + x2 = paddle.data(name="x2", shape=shape) + result = F.cosine_similarity(x1, x2, axis=axis, eps=eps) + exe = Executor(place) + fetches = exe.run(default_main_program(), + feed={"x1": np_x1, + "x2": np_x2}, + fetch_list=[result]) + + np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) + self.assertTrue(np.allclose(fetches[0], np_out)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph_1(self): + paddle.disable_static() + + shape = [10, 15] + axis = 1 + eps = 1e-8 + np.random.seed(1) + np_x1 = np.random.rand(*shape).astype(np.float32) + np_x2 = np.random.rand(*shape).astype(np.float32) + np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) + + tesnor_x1 = paddle.to_variable(np_x1) + tesnor_x2 = paddle.to_variable(np_x2) + y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) + + self.assertTrue(np.allclose(y.numpy(), np_out)) + + def test_dygraph_2(self): + paddle.disable_static() + + shape = [12, 13] + axis = 0 + eps = 1e-6 + np.random.seed(1) + np_x1 = np.random.rand(*shape).astype(np.float32) + np_x2 = np.random.rand(*shape).astype(np.float32) + np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) + + tesnor_x1 = paddle.to_variable(np_x1) + tesnor_x2 = paddle.to_variable(np_x2) + y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) + + self.assertTrue(np.allclose(y.numpy(), np_out)) + + def test_dygraph_3(self): + paddle.disable_static() + + shape1 = [10, 12, 10] + shape2 = [10, 1, 10] + axis = 2 + eps = 1e-6 + np.random.seed(1) + np_x1 = np.random.rand(*shape1).astype(np.float32) + np_x2 = np.random.rand(*shape2).astype(np.float32) + np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) + + tesnor_x1 = paddle.to_variable(np_x1) + tesnor_x2 = paddle.to_variable(np_x2) + y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) + + self.assertTrue(np.allclose(y.numpy(), np_out)) + + def test_dygraph_4(self): + paddle.disable_static() + + shape1 = [23, 12, 1] + shape2 = [23, 1, 10] + axis = 2 + eps = 1e-6 + np.random.seed(1) + np_x1 = np.random.rand(*shape1).astype(np.float32) + np_x2 = np.random.rand(*shape2).astype(np.float32) + np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) + + cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps) + tesnor_x1 = paddle.to_variable(np_x1) + tesnor_x2 = paddle.to_variable(np_x2) + y = cos_sim_func(tesnor_x1, tesnor_x2) + + self.assertTrue(np.allclose(y.numpy(), np_out)) + + +if __name__ == '__main__': + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 7f667d6b71c7f52f6d5afb42045c2da0cc45587b..4982cd195820811b9a8ec3fe6d01955234032120 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -535,5 +535,443 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) +class FuncCrossEntropyLoss(unittest.TestCase): + #1 + def test_cross_entropy_loss_1d_with_weight_mean(self): + input_np = np.random.random([100, 200]).astype(np.float64) + label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + weight_np = np.random.random([200]).astype(np.float64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float64') + label = fluid.data(name='label', shape=[100], dtype='int64') + weight = fluid.data(name='weight', shape=[200], dtype='float64') + ret = paddle.nn.functional.cross_entropy( + input, label, weight=weight) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + weight=fluid.dygraph.to_variable(weight_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_1d( + input_np, label_np, weight=weight_np)[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #2 + def test_cross_entropy_loss_1d_with_weight_sum(self): + input_np = np.random.random([100, 200]).astype(np.float64) + label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + weight_np = np.random.random([200]).astype(np.float64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float64') + label = fluid.data(name='label', shape=[100], dtype='int64') + weight = fluid.data(name='weight', shape=[200], dtype='float64') + ret = paddle.nn.functional.cross_entropy( + input, label, weight=weight, reduction='sum') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + weight=fluid.dygraph.to_variable(weight_np), + reduction='sum') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_1d( + input_np, label_np, weight=weight_np, reduction='sum')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #3 + def test_cross_entropy_loss_1d_with_weight_none(self): + input_np = 
np.random.random([100, 200]).astype(np.float64) + label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + weight_np = np.random.random([200]).astype(np.float64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float64') + label = fluid.data(name='label', shape=[100], dtype='int64') + weight = fluid.data(name='weight', shape=[200], dtype='float64') + ret = paddle.nn.functional.cross_entropy( + input, label, weight=weight, reduction='none') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + weight=fluid.dygraph.to_variable(weight_np), + reduction='none') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_1d( + input_np, label_np, weight=weight_np, reduction='none') + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #4 + def test_cross_entropy_loss_1d_mean(self): + input_np = np.random.random([100, 200]).astype(np.float64) + label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float64') + label = fluid.data(name='label', shape=[100], dtype='int64') + ret = paddle.nn.functional.cross_entropy(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={'input': input_np, + 'label': label_np}, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_1d(input_np, label_np)[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #5 + def test_cross_entropy_loss_1d_sum(self): + input_np = np.random.random([100, 200]).astype(np.float64) + label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float64') + label = fluid.data(name='label', shape=[100], dtype='int64') + ret = paddle.nn.functional.cross_entropy( + input, label, reduction='sum') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={'input': input_np, + 'label': label_np}, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + reduction='sum') + dy_ret_value = dy_ret.numpy() 
+ self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_1d(input_np, label_np, reduction='sum')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #6 + def test_cross_entropy_loss_1d_none(self): + input_np = np.random.random([100, 200]).astype(np.float64) + label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float64') + label = fluid.data(name='label', shape=[100], dtype='int64') + ret = paddle.nn.functional.cross_entropy( + input, label, reduction='none') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={'input': input_np, + 'label': label_np}, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + reduction='none') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_1d(input_np, label_np, reduction='none') + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #7 + def test_cross_entropy_loss_2d_with_weight_none(self): + input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) + label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + weight_np = np.random.random(size=(3, )).astype(np.float64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[5, 3, 5, 5], dtype='float64') + label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + weight = fluid.data(name='weight', shape=[3], dtype='float64') + ret = paddle.nn.functional.cross_entropy( + input, label, weight=weight, reduction='none') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + weight=fluid.dygraph.to_variable(weight_np), + reduction='none') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d( + input_np, label_np, weight=weight_np, reduction='none') + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #8 + def test_cross_entropy_loss_2d_with_weight_mean(self): + input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) + label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + weight_np = np.random.random(size=(3, )).astype(np.float64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[5, 3, 5, 5], 
dtype='float64') + label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + weight = fluid.data(name='weight', shape=[3], dtype='float64') + ret = paddle.nn.functional.cross_entropy( + input, label, weight=weight, reduction='mean') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + weight=fluid.dygraph.to_variable(weight_np), + reduction='mean') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d( + input_np, label_np, weight=weight_np, reduction='mean')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #9 + def test_cross_entropy_loss_2d_with_weight_sum(self): + input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) + label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + weight_np = np.random.random(size=(3, )).astype(np.float64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[5, 3, 5, 5], dtype='float64') + label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + weight = fluid.data(name='weight', shape=[3], dtype='float64') + ret = paddle.nn.functional.cross_entropy( + input, label, weight=weight, reduction='sum') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + weight=fluid.dygraph.to_variable(weight_np), + reduction='sum') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d( + input_np, label_np, weight=weight_np, reduction='sum')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #10 + def test_cross_entropy_loss_2d_none(self): + input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) + label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[5, 3, 5, 5], dtype='float64') + label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + ret = paddle.nn.functional.cross_entropy( + input, label, reduction='none') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + reduction='none') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = 
cross_entropy_loss_2d(input_np, label_np, reduction='none') + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #11 + def test_cross_entropy_loss_2d_mean(self): + input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) + label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[5, 3, 5, 5], dtype='float64') + label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + ret = paddle.nn.functional.cross_entropy( + input, label, reduction='mean') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + reduction='mean') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d( + input_np, label_np, reduction='mean')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + #12 + def test_cross_entropy_loss_2d_sum(self): + input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) + label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[5, 3, 5, 5], dtype='float64') + label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + ret = paddle.nn.functional.cross_entropy( + input, label, reduction='sum') + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + dy_ret = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np), + reduction='sum') + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d(input_np, label_np, reduction='sum')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index 57024e8ae5cd5dd2fbb231269fda50b5ef6e7a47..ad121fac8cc045e67cf116d2cf9cedd6ac9bef99 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -54,7 +54,7 @@ class TestCumsumOp(unittest.TestCase): def run_static(self, use_gpu=False): with fluid.program_guard(fluid.Program()): data_np = np.random.random((100, 100)).astype(np.float32) - x = paddle.nn.data('X', [100, 100]) + x = paddle.static.data('X', [100, 100]) y = paddle.cumsum(x) y2 = paddle.cumsum(x, axis=0) y3 = paddle.cumsum(x, axis=-1) @@ -100,7 +100,7 @@ class 
TestCumsumOp(unittest.TestCase): def test_name(self): with fluid.program_guard(fluid.Program()): - x = paddle.nn.data('x', [3, 4]) + x = paddle.static.data('x', [3, 4]) y = paddle.cumsum(x, name='out') self.assertTrue('out' in y.name) diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py index 22dc72048e429ed257e9d7d1213b6cb7dcafbf1a..8070148f8b36dd7dab7711abaf25994acebc7e6f 100644 --- a/python/paddle/fluid/tests/unittests/test_data.py +++ b/python/paddle/fluid/tests/unittests/test_data.py @@ -16,9 +16,11 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid import Program, program_guard +import paddle.fluid.core as core class TestApiDataError(unittest.TestCase): @@ -53,5 +55,49 @@ class TestApiDataError(unittest.TestCase): self.assertRaises(TypeError, test_shape_type) +class TestApiStaticDataError(unittest.TestCase): + def test_fluid_dtype(self): + with program_guard(Program(), Program()): + x1 = paddle.static.data(name="x1", shape=[2, 25]) + self.assertEqual(x1.dtype, core.VarDesc.VarType.FP32) + + x2 = paddle.static.data(name="x2", shape=[2, 25], dtype="bool") + self.assertEqual(x2.dtype, core.VarDesc.VarType.BOOL) + + paddle.set_default_dtype("float64") + x3 = paddle.static.data(name="x3", shape=[2, 25]) + self.assertEqual(x3.dtype, core.VarDesc.VarType.FP64) + + def test_fluid_data(self): + with program_guard(Program(), Program()): + + # 1. The type of 'name' in fluid.data must be str. + def test_name_type(): + paddle.static.data(name=1, shape=[2, 25], dtype="bool") + + self.assertRaises(TypeError, test_name_type) + + # 2. The type of 'shape' in fluid.data must be list or tuple. + def test_shape_type(): + paddle.static.data(name='data1', shape=2, dtype="bool") + + self.assertRaises(TypeError, test_shape_type) + + def test_layers_data(self): + with program_guard(Program(), Program()): + + # 1. The type of 'name' in layers.data must be str. + def test_name_type(): + paddle.static.data(name=1, shape=[2, 25], dtype="bool") + + self.assertRaises(TypeError, test_name_type) + + # 2. The type of 'shape' in layers.data must be list or tuple. 
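# ---------------------------------------------------------------------------
# Editorial sketch, not part of this patch: the contract exercised by the
# TestApiStaticDataError cases above, written as a standalone snippet.
# paddle.static.data requires a str name and a list/tuple shape, and its
# default dtype follows paddle.set_default_dtype.
import paddle

paddle.enable_static()
paddle.set_default_dtype("float32")
x1 = paddle.static.data(name="x1", shape=[2, 25])                # float32 by default
x2 = paddle.static.data(name="x2", shape=(2, 25), dtype="bool")  # explicit dtype
# paddle.static.data(name=1, shape=[2, 25])   -> TypeError: name must be a str
# paddle.static.data(name="x3", shape=2)      -> TypeError: shape must be list/tuple
# ---------------------------------------------------------------------------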
+ def test_shape_type(): + paddle.static.data(name='data1', shape=2, dtype="bool") + + self.assertRaises(TypeError, test_shape_type) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_data_norm_op.py b/python/paddle/fluid/tests/unittests/test_data_norm_op.py index c766cf17f422205521641ae44ab2060b4ab6e81c..cefef9ff9183e34d1ae7ae3e9a2f88969bf094a6 100644 --- a/python/paddle/fluid/tests/unittests/test_data_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_data_norm_op.py @@ -271,7 +271,7 @@ class TestDataNormOpWithEnableScaleAndShift(OpTest): self.use_mkldnn = False epsilon = 0.00001 slot_dim = -1 - enable_scale_and_shitf = True + enable_scale_and_shift = True x_shape = [2, 50] scale_shape = [50] tp = np.float32 @@ -319,6 +319,63 @@ class TestDataNormOpWithEnableScaleAndShift(OpTest): self.check_grad(['X'], 'Y', no_grad_set=set([])) +class TestDataNormOpWithoutEnableScaleAndShift(OpTest): + """ + test class for data norm op + test forward and backward + """ + + def setUp(self): + """ + init data norm op test env + """ + self.op_type = 'data_norm' + self.use_mkldnn = False + epsilon = 0.00001 + slot_dim = -1 + enable_scale_and_shift = True + x_shape = [2, 50] + scale_shape = [50] + tp = np.float32 + + x_val = np.random.uniform(-1, 1, x_shape).astype(tp) + batch_size = np.ones(scale_shape).astype(tp) + batch_size *= 1e4 + batch_sum = np.zeros(scale_shape).astype(tp) + batch_square_sum = np.ones(scale_shape).astype(tp) + batch_square_sum *= 1e4 + scale_w = np.ones(scale_shape).astype(tp) + bias = np.zeros(scale_shape).astype(tp) + + y = np.array(x_val) + + mean = np.zeros(x_shape).astype(tp) + scale = np.ones(x_shape).astype(tp) + + self.inputs = { + "X": x_val, + "BatchSize": batch_size, + "BatchSum": batch_sum, + "BatchSquareSum": batch_square_sum, + "scale_w": scale_w, + "bias": bias + } + self.outputs = {"Y": y, "Means": mean, "Scales": scale} + self.attrs = {"epsilon": epsilon, "use_mkldnn": self.use_mkldnn} + + def test_check_output(self): + """ + test check forward, check output + """ + self.check_output() + + def test_check_grad(self): + """ + test check backward, check grad + """ + self.check_grad(['X'], 'Y', no_grad_set=set([])) + + class TestDataNormOpWithEnableScaleAndShift_1(OpTest): """ test class for data norm op @@ -333,7 +390,7 @@ class TestDataNormOpWithEnableScaleAndShift_1(OpTest): self.use_mkldnn = False epsilon = 0.00001 slot_dim = 1 - enable_scale_and_shitf = True + enable_scale_and_shift = True x_shape = [2, 50] scale_shape = [50] tp = np.float32 diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index f8cb6170be945ed628440b5a068f1acd0ac26503..cc0f3745bbf7bb1fae0c0ac430491ac2e0d9b55f 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -34,10 +34,10 @@ def random_reader(): def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup_prog = fluid.Program() main_prog = fluid.Program() - startup_prog.random_seed = 1 - main_prog.random_seed = 1 with fluid.unique_name.guard(): with fluid.program_guard(main_prog, startup_prog): @@ -122,14 +122,8 @@ class TestBase(unittest.TestCase): label = item['label'] assert image.shape() == [BATCH_SIZE, 784] assert label.shape() == [BATCH_SIZE, 1] - if ps[i]._equals(fluid.CPUPlace()): - assert 
image._place()._equals(fluid.CPUPlace()) - assert label._place()._equals(fluid.CPUPlace()) - else: - assert image._place()._equals( - fluid.CUDAPinnedPlace()) - assert label._place()._equals( - fluid.CUDAPinnedPlace()) + assert image._place()._equals(ps[i]) + assert label._place()._equals(ps[i]) L, = exe.run(program=prog, feed=d, fetch_list=[loss], diff --git a/python/paddle/fluid/tests/unittests/test_default_dtype.py b/python/paddle/fluid/tests/unittests/test_default_dtype.py index eba4ec3420f2d5b56a880ce314b7168fb5e84154..057933fc7a735c2732cd651e83e99ddfa747b8a8 100644 --- a/python/paddle/fluid/tests/unittests/test_default_dtype.py +++ b/python/paddle/fluid/tests/unittests/test_default_dtype.py @@ -33,8 +33,28 @@ class TestDefaultType(unittest.TestCase): set_default_dtype("float64") self.assertEqual("float64", get_default_dtype()) - set_default_dtype(np.int32) - self.assertEqual("int32", get_default_dtype()) + set_default_dtype("float32") + self.assertEqual("float32", get_default_dtype()) + + set_default_dtype("float16") + self.assertEqual("float16", get_default_dtype()) + + set_default_dtype(np.float64) + self.assertEqual("float64", get_default_dtype()) + + set_default_dtype(np.float32) + self.assertEqual("float32", get_default_dtype()) + + set_default_dtype(np.float16) + self.assertEqual("float16", get_default_dtype()) + + +class TestRaiseError(unittest.TestCase): + def test_error(self): + self.assertRaises(TypeError, set_default_dtype, "int32") + self.assertRaises(TypeError, set_default_dtype, np.int32) + self.assertRaises(TypeError, set_default_dtype, "int64") + self.assertRaises(TypeError, set_default_dtype, np.int64) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py index b6566676d2533aad5272fe61dbedbc1d55ea213b..780d57b53310bb5f385a131d4ad52dd6f5e695f0 100644 --- a/python/paddle/fluid/tests/unittests/test_diag.py +++ b/python/paddle/fluid/tests/unittests/test_diag.py @@ -17,11 +17,181 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid import Program, program_guard +class TestDiagV2Op(OpTest): + def setUp(self): + self.op_type = "diag_v2" + self.x = np.random.rand(10, 10) + self.offset = 0 + self.padding_value = 0.0 + self.out = np.diag(self.x, self.offset) + + self.init_config() + self.inputs = {'X': self.x} + self.attrs = { + 'offset': self.offset, + 'padding_value': self.padding_value + } + self.outputs = {'Out': self.out} + + def test_check_output(self): + self.check_output() + + def init_config(self): + pass + + +class TestDiagV2OpCase1(TestDiagV2Op): + def init_config(self): + self.offset = 1 + self.out = np.diag(self.x, self.offset) + + +class TestDiagV2OpCase2(TestDiagV2Op): + def init_config(self): + self.offset = -1 + self.out = np.diag(self.x, self.offset) + + +class TestDiagV2OpCase3(TestDiagV2Op): + def init_config(self): + self.x = np.random.randint(-10, 10, size=(10, 10)) + self.out = np.diag(self.x, self.offset) + + +class TestDiagV2OpCase4(TestDiagV2Op): + def init_config(self): + self.x = np.random.rand(100) + self.padding_value = 8 + n = self.x.size + self.out = self.padding_value * np.ones((n, n)) + np.diag( + self.x, self.offset) - np.diag(self.padding_value * np.ones(n)) + + +class TestDiagV2Error(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_diag_v2_type(): + x = [1, 2, 3] + 
output = paddle.diag(x) + + self.assertRaises(TypeError, test_diag_v2_type) + + x = paddle.static.data('data', [3, 3]) + self.assertRaises(TypeError, paddle.diag, x, offset=2.5) + + self.assertRaises(TypeError, paddle.diag, x, padding_value=[9]) + + x = paddle.static.data('data2', [3, 3, 3]) + self.assertRaises(ValueError, paddle.diag, x) + + +class TestDiagV2API(unittest.TestCase): + def setUp(self): + self.input_np = np.random.random(size=(10, 10)).astype(np.float32) + self.expected0 = np.diag(self.input_np) + self.expected1 = np.diag(self.input_np, k=1) + self.expected2 = np.diag(self.input_np, k=-1) + + self.input_np2 = np.random.rand(100) + self.offset = 0 + self.padding_value = 8 + n = self.input_np2.size + self.expected3 = self.padding_value * np.ones( + (n, n)) + np.diag(self.input_np2, self.offset) - np.diag( + self.padding_value * np.ones(n)) + + self.input_np3 = np.random.randint(-10, 10, size=(100)).astype(np.int64) + self.padding_value = 8.0 + n = self.input_np3.size + self.expected4 = self.padding_value * np.ones( + (n, n)) + np.diag(self.input_np3, self.offset) - np.diag( + self.padding_value * np.ones(n)) + + self.padding_value = -8 + self.expected5 = self.padding_value * np.ones( + (n, n)) + np.diag(self.input_np3, self.offset) - np.diag( + self.padding_value * np.ones(n)) + + def run_imperative(self): + x = paddle.to_tensor(self.input_np) + y = paddle.diag(x) + self.assertTrue(np.allclose(y.numpy(), self.expected0)) + + y = paddle.diag(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected1)) + + y = paddle.diag(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected2)) + + x = paddle.to_tensor(self.input_np2) + y = paddle.diag(x, padding_value=8) + self.assertTrue(np.allclose(y.numpy(), self.expected3)) + + x = paddle.to_tensor(self.input_np3) + y = paddle.diag(x, padding_value=8.0) + self.assertTrue(np.allclose(y.numpy(), self.expected4)) + + y = paddle.diag(x, padding_value=-8) + self.assertTrue(np.allclose(y.numpy(), self.expected5)) + + def run_static(self, use_gpu=False): + x = paddle.data(name='input', shape=[10, 10], dtype='float32') + x2 = paddle.data(name='input2', shape=[100], dtype='float64') + x3 = paddle.data(name='input3', shape=[100], dtype='int64') + result0 = paddle.diag(x) + result1 = paddle.diag(x, offset=1) + result2 = paddle.diag(x, offset=-1) + result3 = paddle.diag(x, name='aaa') + result4 = paddle.diag(x2, padding_value=8) + result5 = paddle.diag(x3, padding_value=8.0) + result6 = paddle.diag(x3, padding_value=-8) + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + res0, res1, res2, res4, res5, res6 = exe.run( + feed={ + "input": self.input_np, + "input2": self.input_np2, + 'input3': self.input_np3 + }, + fetch_list=[result0, result1, result2, result4, result5, result6]) + + self.assertTrue(np.allclose(res0, self.expected0)) + self.assertTrue(np.allclose(res1, self.expected1)) + self.assertTrue(np.allclose(res2, self.expected2)) + self.assertTrue('aaa' in result3.name) + self.assertTrue(np.allclose(res4, self.expected3)) + self.assertTrue(np.allclose(res5, self.expected4)) + self.assertTrue(np.allclose(res6, self.expected5)) + + def test_cpu(self): + paddle.disable_static(place=paddle.fluid.CPUPlace()) + self.run_imperative() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static() + + def test_gpu(self): + if not fluid.core.is_compiled_with_cuda(): + return + + 
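# ---------------------------------------------------------------------------
# Editorial sketch, not part of this patch: the expected-value construction
# used in TestDiagV2API.setUp above, written out once for readability. For a
# 1-D input of length n with offset 0, paddle.diag(x, padding_value=p) is
# expected to produce an n x n matrix with x on the main diagonal and p
# everywhere else, which is what
#     p * np.ones((n, n)) + np.diag(x) - np.diag(p * np.ones(n))
# builds. The helper name below is an assumption for illustration only.
import numpy as np

def expected_diag_1d(x, padding_value=0.0):
    n = x.size
    out = np.full((n, n), padding_value, dtype=np.result_type(x, padding_value))
    np.fill_diagonal(out, x)
    return out
# ---------------------------------------------------------------------------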
paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + self.run_imperative() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static(use_gpu=True) + + class TestDiagOp(OpTest): def setUp(self): self.op_type = "diag" diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 4dc2c92ad918c269d5277da0c13d4a96182a253d..2919ec5e9ca97b1d59af46a54b2d702cb6de4a14 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -26,8 +26,8 @@ import paddle class TestDirectory(unittest.TestCase): def get_import_command(self, module): paths = module.split('.') - if len(paths) <= 1: - return module + if len(paths) == 1: + return 'import {}'.format(module) package = '.'.join(paths[:-1]) func = paths[-1] cmd = 'from {} import {}'.format(package, func) @@ -39,9 +39,9 @@ class TestDirectory(unittest.TestCase): 'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', 'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.static.save', 'paddle.static.load', - 'paddle.BackwardStrategy', 'paddle.ParallelEnv', - 'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit', - 'paddle.jit.TracedLayer', 'paddle.jit.to_static', + 'paddle.distributed.ParallelEnv', + 'paddle.distributed.prepare_context', 'paddle.DataParallel', + 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', 'paddle.NoamDecay', 'paddle.PiecewiseDecay', @@ -98,7 +98,6 @@ class TestDirectory(unittest.TestCase): 'paddle.imperative.enable', 'paddle.imperative.guard', 'paddle.imperative.grad', 'paddle.imperative.no_grad', 'paddle.imperative.save', 'paddle.imperative.load', - 'paddle.imperative.BackwardStrategy', 'paddle.imperative.ParallelEnv', 'paddle.imperative.prepare_context', 'paddle.imperative.DataParalell', 'paddle.imperative.jit', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index ba292f2d87c376ace317fc3fb9b81ce5c5596eb2..faff81fa84fb5fa66c9ff14f782d2301e3964672 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -23,8 +23,11 @@ import subprocess import six import argparse import pickle +import random import numpy as np import time + +import paddle import paddle.fluid as fluid from paddle.fluid import compiler import paddle.fluid.dygraph as dygraph @@ -382,22 +385,22 @@ class TestParallelDyGraphRunnerBase(object): raise NotImplementedError( "train_one_loop should be implemented by the child classes.") + def _get_data(self, batch, args): + if args.update_method != "local": + new_batch = [] + for offset, item in enumerate(batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return batch + def run_trainer(self, args): seed = 90 device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = fluid.CUDAPlace(device_id) - def _get_data(batch): - if args.update_method != "local": - new_batch = [] - for offset, item in enumerate(batch): - if offset % 2 == args.trainer_id: - new_batch.append(item) - return new_batch - else: - return batch - with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -422,7 +425,7 @@ class 
TestParallelDyGraphRunnerBase(object): out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): - data = _get_data(data) + data = self._get_data(data, args) if step_id == RUN_STEP: break loss = self.run_one_loop(model, opt, data) @@ -444,6 +447,47 @@ class TestParallelDyGraphRunnerBase(object): model.clear_gradients() print_to_out(out_losses) + def run_trainer_with_spawn(self, args): + # 1. enable dygraph + paddle.disable_static() + + # 2. init seed + seed = 90 + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed = seed + # get trainer id + args.trainer_id = paddle.distributed.get_rank() + + # 3. init parallel env + if args.update_method == "nccl2": + paddle.distributed.init_parallel_env() + + # 4. train model + model, train_reader, opt = self.get_model() + if args.update_method == "nccl2": + model = paddle.DataParallel(model) + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + out_losses.append(loss.numpy()) + + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.minimize(loss) + model.clear_gradients() + return out_losses + def runtime_main(test_class): parser = argparse.ArgumentParser(description='Run dist test.') diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index 28bd637726ebe6693296c2c585bed69cb1f1ab24..9df55a6b873e28a6e479fd05b31074802eb19bb7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -17,7 +17,7 @@ import time import unittest import paddle -import paddle.fluid.incubate.fleet.base.role_maker as role_maker +import paddle.distributed.fleet.base.role_maker as role_maker class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): @@ -55,7 +55,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -100,7 +100,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py index 9cd35f1754ff7f81f918635d6bffc566228b0543..59ca41a11e325cfb66a3a3eaadb4eca6f9764212 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py @@ -15,7 +15,7 @@ import unittest import paddle import os -import paddle.fluid.incubate.fleet.base.role_maker as role_maker 
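# ---------------------------------------------------------------------------
# Editorial sketch, not part of this patch: the recurring migration applied in
# these fleet unit tests. role_maker now comes from
# paddle.distributed.fleet.base, and the async/geo modes are expressed through
# DistributedStrategy flags instead of the old StrategyFactory helpers. The
# helper name build_async_strategy is an assumption for illustration only.
import paddle
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker

def build_async_strategy(k_steps=None):
    # a_sync=True selects async parameter-server training; adding k_steps
    # to a_sync_configs turns it into geo mode.
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    if k_steps is not None:
        strategy.a_sync_configs = {"k_steps": k_steps}
    return strategy

# Used after fleet.init(role) in each test, roughly as:
#   optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
#   optimizer = fleet.distributed_optimizer(optimizer, strategy=build_async_strategy(100))
#   optimizer.minimize(avg_cost)   # avg_cost defined by each test's model
# ---------------------------------------------------------------------------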
+import paddle.distributed.fleet.base.role_maker as role_maker import time @@ -55,7 +55,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True strategy.a_sync_configs = {"k_steps": 100} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index c8130d62c304b9c1a9aa6623378456806adf3da2..e0993e022e1b9570773634ec829b088c5ff145ea 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -16,7 +16,7 @@ import unittest import paddle import os import paddle.distributed.fleet as fleet -import paddle.fluid.incubate.fleet.base.role_maker as role_maker +import paddle.distributed.fleet.base.role_maker as role_maker import time @@ -47,7 +47,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index f72850f949715caa7282e6ca85f148a75ebc1074..beb0069eb770f25d7834749ff9c188e5252e13c0 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -31,10 +31,11 @@ import time import tempfile import unittest +import paddle import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker from paddle.distributed.fleet.base.util_factory import fleet_util -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet +from paddle.distributed.fleet import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory __all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main'] @@ -56,7 +57,7 @@ class FleetDistRunnerBase(object): if args.role.upper() == "PSERVER": role = role_maker.UserDefinedRoleMaker( is_collective=False, - init_gloo=True, + init_gloo=False, path=args.gloo_path, current_id=args.current_id, role=role_maker.Role.SERVER, @@ -65,7 +66,7 @@ class FleetDistRunnerBase(object): else: role = role_maker.UserDefinedRoleMaker( is_collective=False, - init_gloo=True, + init_gloo=False, path=args.gloo_path, current_id=args.current_id, role=role_maker.Role.WORKER, @@ -75,21 +76,23 @@ class FleetDistRunnerBase(object): return role def build_strategy(self, args): - self.strategy = None + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.a_sync = False if args.mode == "async": - self.strategy = StrategyFactory.create_async_strategy() - elif args.mode == "sync": - self.strategy = StrategyFactory.create_sync_strategy() - elif args.mode == "half_async": - self.strategy = StrategyFactory.create_half_async_strategy() + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.a_sync = True elif args.mode == 
"geo": - self.strategy = StrategyFactory.create_geo_strategy( - args.geo_sgd_need_push_nums) + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.a_sync = True + self.strategy.a_sync_configs = { + "k_steps": args.geo_sgd_need_push_nums + } self.dump_param = os.getenv("dump_param", "").split(",") self.dump_fields = os.getenv("dump_fields", "").split(",") self.dump_fields_path = os.getenv("dump_fields_path", "") debug = int(os.getenv("Debug", "0")) - if debug: + # TODO(update strategy to support dump params) + if False: #debug: self.strategy.set_debug_opt({ "dump_param": self.dump_param, "dump_fields": self.dump_fields, @@ -122,7 +125,7 @@ class FleetDistRunnerBase(object): staircase=True)) else: optimizer = fluid.optimizer.SGD(LEARNING_RATE) - optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) def run_pserver(self, args): @@ -157,7 +160,13 @@ class TestFleetBase(unittest.TestCase): def _setup_config(self): raise NotImplementedError("tests should have _setup_config implemented") + def tearDown(self): + t = time.time() - self.startTime + print('%s: %.3f' % (self.__class__.__name__, t)) + def setUp(self): + self.startTime = time.time() + self._mode = "sync" self._reader = "pyreader" self._trainers = 2 @@ -278,6 +287,23 @@ class TestFleetBase(unittest.TestCase): tr0_ret = tr0.returncode tr1_ret = tr0.returncode + if tr0_ret != 0: + print( + "========================Error tr0_err begin===========================" + ) + os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log")) + print( + "========================Error tr0_err end===========================" + ) + + if tr1_ret != 0: + print( + "========================Error tr1_err begin===========================" + ) + os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log")) + print( + "========================Error tr1_err end===========================" + ) self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index 18629c4f996a6d068339bd6cad494e8e8d21123f..b506f179143412e2bdb5d9eda511d90a0a3eea6d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -22,7 +22,7 @@ from test_dist_fleet_base import TestFleetBase class TestDistMnistSync2x2(TestFleetBase): def _setup_config(self): - self._mode = "async" + self._mode = "sync" self._reader = "pyreader" def check_with_place(self, @@ -123,7 +123,7 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase): class TestDistCtrHalfAsync2x2(TestFleetBase): def _setup_config(self): - self._mode = "half_async" + self._mode = "async" self._reader = "pyreader" def check_with_place(self, @@ -156,5 +156,40 @@ class TestDistCtrHalfAsync2x2(TestFleetBase): "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) +class TestDistCtrPsGpuPyreaderAsync2x2(TestFleetBase): + def _setup_config(self): + self._mode = "async" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "30000", # 5sec to fail fast + "http_proxy": "", + 
"FLAGS_communicator_send_queue_size": "2", + "FLAGS_communicator_max_merge_var_num": "2", + "CPU_NUM": "2", + "SAVE_MODEL": "1" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_ctr_ps_gpu.py", delta=1e-5, check_error_log=True) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py index 34b8300383951f5636e5c86a39e76aa1a2be8e5d..b4bc0d8dadce44c8f711189466f34fb5cd76f39f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py @@ -168,6 +168,8 @@ class TestDistGloo_2x2(TestFleetBase): "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36013,127.0.0.1:36014", "PADDLE_TRAINERS_NUM": "2", "PADDLE_PSERVER_ID": "0", + #GLOO FLAG + "PADDLE_WITH_GLOO": "1", } required_envs.update(need_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index 5fcf5d894b2ee7a7d795008fcb4edef927827dc3..e7b10be2349cce755267297025ca8520b6d494ee 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -21,7 +21,7 @@ import shutil import paddle import paddle.fluid as fluid -import paddle.fluid.incubate.fleet.base.role_maker as role_maker +import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet as fleet # For Net @@ -165,7 +165,7 @@ class TestPSPassWithBow(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py index 761d57408b9a8f9e52419331bfb0bca5b0135c30..1062123948481a4164a12a4bed818b964923006f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py @@ -113,8 +113,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase): ["listen_and_serv"]) # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale self.assertEqual([op.type for op in pserver.blocks[1].ops], [ - "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow", - "scale" + "sum", "cast", "fill_constant", "elementwise_div", "floor", + "fill_constant", "elementwise_pow", "scale" ]) # block1~2: optimize pass diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py new file mode 100644 index 0000000000000000000000000000000000000000..533ad9604cf0d879371796fb197e61e931fb479f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -0,0 +1,938 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import paddle +from paddle import fluid +from paddle.fluid import layers +from paddle.distribution import * +import math + + +class DistributionNumpy(): + def sample(self): + raise NotImplementedError + + def entropy(self): + raise NotImplementedError + + def kl_divergence(self, other): + raise NotImplementedError + + def log_prob(self, value): + raise NotImplementedError + + def probs(self, value): + raise NotImplementedError + + +class UniformNumpy(DistributionNumpy): + def __init__(self, low, high): + self.low = np.array(low).astype('float32') + self.high = np.array(high).astype('float32') + + def sample(self, shape): + shape = tuple(shape) + (self.low + self.high).shape + return self.low + (np.random.uniform(size=shape) * + (self.high - self.low)) + + def log_prob(self, value): + lb = np.less(self.low, value).astype('float32') + ub = np.less(value, self.high).astype('float32') + return np.log(lb * ub) - np.log(self.high - self.low) + + def probs(self, value): + lb = np.less(self.low, value).astype('float32') + ub = np.less(value, self.high).astype('float32') + return (lb * ub) / (self.high - self.low) + + def entropy(self): + return np.log(self.high - self.low) + + +class NormalNumpy(DistributionNumpy): + def __init__(self, loc, scale): + self.loc = np.array(loc).astype('float32') + self.scale = np.array(scale).astype('float32') + + def sample(self, shape): + shape = tuple(shape) + (self.loc + self.scale).shape + return self.loc + (np.random.randn(*shape) * self.scale) + + def log_prob(self, value): + var = self.scale * self.scale + log_scale = np.log(self.scale) + return -((value - self.loc) * (value - self.loc)) / ( + 2. * var) - log_scale - math.log(math.sqrt(2. * math.pi)) + + def probs(self, value): + var = self.scale * self.scale + return np.exp(-1. * ((value - self.loc) * (value - self.loc)) / + (2. * var)) / (math.sqrt(2 * math.pi) * self.scale) + + def entropy(self): + return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype( + 'float32')) + np.log(self.scale) + + def kl_divergence(self, other): + var_ratio = (self.scale / other.scale) + var_ratio = var_ratio * var_ratio + t1 = ((self.loc - other.loc) / other.scale) + t1 = (t1 * t1) + return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) + + +class DistributionTest(unittest.TestCase): + def setUp(self, use_gpu=False): + self.use_gpu = use_gpu + if not use_gpu: + place = fluid.CPUPlace() + self.gpu_id = -1 + else: + place = fluid.CUDAPlace(0) + self.gpu_id = 0 + self.executor = fluid.Executor(place) + + def build_normal_common_net(self, batch_size, dims, sample_shape, loc_float, + scale_float, other_loc_float, other_scale_float, + scale_np, other_scale_np, loc_np, other_loc_np, + loc, scale, other_loc, other_scale, values): + """Generate Normal object and get the output of its methods including + ``sample``, ``entropy``, ``log_prob``, ``probs`` and ``kl_divergence``. + Parameters ``loc`` and ``scale`` have different data types to test different situations. + + Args: + batch_size(int): The first dimension of the shape of parameters(loc and scale). 
+ dims(int): The second dimension of the shape of parameters. + sample_shape(int): The sample value used in ``sample`` method. + loc_float(float): Generated in function ``get_normal_random_input``, loc is a float number. + scale_float(float): Generated in function ``get_normal_random_input``, scale is a float number. + other_loc_float(float): Generated in function ``get_normal_random_input``, other_loc is a + float number. It is the first parameter in another Normal object used in ``kl_divergence`` + method. + other_scale_float(float): Generated in function ``get_normal_random_input``, other_scale is a + float number. It is the second parameter in another Normal object used in ``kl_divergence`` + method. + scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array + whose shape is [batch_size, dims]. + other_scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_scale_np + is an numpy array. It is the second parameter in another Normal object used in ``kl_divergence`` + method. + loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array + whose shape is [batch_size, dims]. + other_loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_loc_np + is an numpy array. It is the first parameter in another Normal object used in ``kl_divergence`` + method. + loc(Tensor): In dynamic mode, loc is generated in ``build_normal_dygraph``, it's a Tensor filled + with ``loc_np`` data. In static mode, loc is generated in ``build_normal_static``, ``layers.data`` + method is used to get a Placeholder whose shape is [dims]. + scale(Tensor): In dynamic mode, scale is generated in ``build_normal_dygraph``, it's a Tensor filled + with ``scale_np`` data. In static mode, scale is generated in ``build_normal_static``, ``layers.data`` + method is used to get a Placeholder whose shape is [dims]. + other_loc(Tensor): In dynamic mode, other_loc is generated in ``build_normal_dygraph``, it's a Tensor + filled with ``other_loc_np`` data. In static mode, other_loc is generated in ``build_normal_static``, + ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the first parameter + in another Normal object used in ``kl_divergence`` method. + other_scale(Tensor): In dynamic mode, other_scale is generated in ``build_normal_dygraph``, it's a Tensor + filled with ``other_scale_np`` data. In static mode, other_scale is generated in ``build_normal_static``, + ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the second parameter + in another Normal object used in ``kl_divergence`` method. + values(Tensor): In dynamic mode, values is generated in ``build_normal_dygraph``, it's a Tensor filled with + ``values_np`` data. In static mode, values is generated in ``build_normal_static``, ``layers.data`` + method is used to get a Placeholder whose shape is [dims]. + + Returns: + List: The elements of the list are the output of sample, entropy, log_prob, probs, kl_divergence methods. + The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be considered. 
+ + """ + normal_int = Normal(int(loc_float), int(scale_float)) + normal_float = Normal(loc_float, scale_float) + other_normal_float = Normal(other_loc_float, other_scale_float) + + normal_float_np_broadcast = Normal(loc_float, scale_np) + other_normal_float_np_broadcast = Normal(other_loc_float, + other_scale_np) + + normal_np = Normal(loc_np, scale_np) + other_normal_np = Normal(other_loc_np, other_scale_np) + + normal_variable = Normal(loc, scale) + other_normal_variable = Normal(other_loc, other_scale) + + sample_int = normal_int.sample([batch_size, dims]) + sample_float = normal_float.sample([batch_size, dims]) + sample_float_np_broadcast = normal_float_np_broadcast.sample( + [batch_size, dims]) + sample_np = normal_np.sample([batch_size, dims]) + sample_variable = normal_variable.sample([batch_size, dims]) + + sample_int_diff = normal_int.sample([sample_shape]) + sample_float_diff = normal_float.sample([sample_shape]) + sample_float_np_broadcast_diff = normal_float_np_broadcast.sample( + [sample_shape]) + sample_np_diff = normal_np.sample([sample_shape]) + sample_variable_diff = normal_variable.sample([sample_shape]) + + entropy_int = normal_int.entropy() + entropy_float = normal_float.entropy() + entropy_float_np_broadcast = normal_float_np_broadcast.entropy() + entropy_np = normal_np.entropy() + entropy_variable = normal_variable.entropy() + + lp_float_np_broadcast = normal_float_np_broadcast.log_prob(values) + lp_np = normal_np.log_prob(values) + lp_variable = normal_variable.log_prob(values) + + p_float_np_broadcast = normal_float_np_broadcast.probs(values) + p_np = normal_np.probs(values) + p_variable = normal_variable.probs(values) + + kl_float = normal_float.kl_divergence(other_normal_float) + kl_float_np_broadcast = normal_float_np_broadcast.kl_divergence( + other_normal_float_np_broadcast) + kl_np = normal_np.kl_divergence(other_normal_np) + kl_variable = normal_variable.kl_divergence(other_normal_variable) + + fetch_list = [ + sample_int, sample_float, sample_float_np_broadcast, sample_np, + sample_variable, sample_int_diff, sample_float_diff, + sample_float_np_broadcast_diff, sample_np_diff, + sample_variable_diff, entropy_int, entropy_float, + entropy_float_np_broadcast, entropy_np, entropy_variable, + lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, + p_np, p_variable, kl_float, kl_float_np_broadcast, kl_np, + kl_variable + ] + return fetch_list + + def build_normal_static(self, test_program, batch_size, dims, sample_shape, + loc_float, scale_float, other_loc_float, + other_scale_float, scale_np, other_scale_np, loc_np, + other_loc_np, values_np): + """ + In static mode, generate feed data of Normal network, and get output fetch_list using + ``build_normal_common_net``. + + Args: + test_program: In static mode, the Program object. + other args can refer to function ``build_normal_common_net``. + + Returns: + feed_vars: The feed data of Normal network in static mode. + fetch_list: The output is generated by function ``build_normal_common_net``. 
+ """ + with fluid.program_guard(test_program): + loc = layers.data(name='loc', shape=[dims], dtype='float32') + scale = layers.data(name='scale', shape=[dims], dtype='float32') + + other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float32') + other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float32') + + values = layers.data(name='values', shape=[dims], dtype='float32') + + fetch_list = self.build_normal_common_net( + batch_size, dims, sample_shape, loc_float, scale_float, + other_loc_float, other_scale_float, scale_np, other_scale_np, + loc_np, other_loc_np, loc, scale, other_loc, other_scale, + values) + + feed_vars = { + 'loc': loc_np, + 'scale': scale_np, + 'other_loc': other_loc_np, + 'other_scale': other_scale_np, + 'values': values_np + } + return feed_vars, fetch_list + + def build_normal_dygraph(self, batch_size, dims, sample_shape, loc_float, + scale_float, other_loc_float, other_scale_float, + scale_np, other_scale_np, loc_np, other_loc_np, + values_np): + """ + In dynamic mode, generate input data of Normal network, and get output fetch_list using + ``build_normal_common_net``. + + Args: + refer to function ``build_normal_common_net``. + + Returns: + fetch_list_numpy: The output is generated by function ``build_normal_common_net``. Transform + these tensor to numpy.ndarray. + """ + loc = paddle.to_tensor(loc_np) + scale = paddle.to_tensor(scale_np) + other_loc = paddle.to_tensor(other_loc_np) + other_scale = paddle.to_tensor(other_scale_np) + values = paddle.to_tensor(values_np) + + fetch_list = self.build_normal_common_net( + batch_size, dims, sample_shape, loc_float, scale_float, + other_loc_float, other_scale_float, scale_np, other_scale_np, + loc_np, other_loc_np, loc, scale, other_loc, other_scale, values) + fetch_list_numpy = [t.numpy() for t in fetch_list] + return fetch_list_numpy + + def get_normal_random_input(self, batch_size, dims): + """ + Generate input data ``loc`` and ``scale`` used in Normal network. + + Args: + refer to function ``build_normal_common_net``. + + Returns: + List: Different data type of ``loc`` and ``scale``, including float, numpy.ndarray. + By the way, ``other_loc`` and ``other_scale`` are used in ``kl_divergence`` method. + refer to ``args`` in function ``build_normal_common_net``. + """ + loc_np = np.random.randn(batch_size, dims).astype('float32') + other_loc_np = np.random.randn(batch_size, dims).astype('float32') + + loc_float = (np.random.ranf() - 0.5) * 4 + scale_float = (np.random.ranf() - 0.5) * 4 + while scale_float < 0: + scale_float = (np.random.ranf() - 0.5) * 4 + + other_loc_float = (np.random.ranf() - 0.5) * 4 + other_scale_float = (np.random.ranf() - 0.5) * 4 + while other_scale_float < 0: + other_scale_float = (np.random.ranf() - 0.5) * 4 + + scale_np = np.random.randn(batch_size, dims).astype('float32') + other_scale_np = np.random.randn(batch_size, dims).astype('float32') + values_np = np.random.randn(batch_size, dims).astype('float32') + + while not np.all(scale_np > 0): + scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(other_scale_np > 0): + other_scale_np = np.random.randn(batch_size, dims).astype('float32') + return [ + loc_np, other_loc_np, loc_float, scale_float, other_loc_float, + other_scale_float, scale_np, other_scale_np, values_np + ] + + def compare_normal_with_numpy(self, + data_list, + output_list, + batch_size=2, + dims=3, + sample_shape=7, + tolerance=1e-6): + """ + Compare the outputs of Normal's methods in paddle and numpy. 
If the outputs are not consistent, + raise errors. + + Args: + data_list: Input data generated by function ``get_normal_random_input``. + output_list: The outputs of Normal's methods in static or dynamic mode. + batch_size(int): The first dimension of the shape of parameters(loc and scale). + dims(int): The second dimension of the shape of parameters. + sample_shape(int): The sample value used in ``sample`` method. + tolerance(float): The tolerance of the error. + """ + loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list + + np_normal_int = NormalNumpy(int(loc_float), int(scale_float)) + np_normal_float = NormalNumpy(loc_float, scale_float) + np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float) + np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np) + np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float, + other_scale_np) + np_normal = NormalNumpy(loc_np, scale_np) + np_other_normal = NormalNumpy(other_loc_np, other_scale_np) + + gt_sample_int = np_normal_int.sample([batch_size, dims]) + gt_sample_float = np_normal_float.sample([batch_size, dims]) + gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample( + [batch_size, dims]) + gt_sample_np = np_normal.sample([batch_size, dims]) + + gt_sample_int_diff = np_normal_int.sample([sample_shape]) + gt_sample_float_diff = np_normal_float.sample([sample_shape]) + gt_sample_float_np_broadcast_diff = np_normal_float_np_broadcast.sample( + [sample_shape]) + gt_sample_np_diff = np_normal.sample([sample_shape]) + + gt_entropy_int = np_normal_int.entropy() + gt_entropy_float = np_normal_float.entropy() + gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy() + gt_entropy = np_normal.entropy() + gt_lp_float_np_broadcast = np_normal_float_np_broadcast.log_prob( + values_np) + gt_lp = np_normal.log_prob(values_np) + gt_p_float_np_broadcast = np_normal_float_np_broadcast.probs(values_np) + gt_p = np_normal.probs(values_np) + gt_kl_float = np_normal_float.kl_divergence(np_other_normal_float) + gt_kl_float_np_broadcast = np_normal_float_np_broadcast.kl_divergence( + np_other_normal_float_np_broadcast) + gt_kl = np_normal.kl_divergence(np_other_normal) + + [ + output_sample_int, output_sample_float, + output_sample_float_np_broadcast, output_sample_np, + output_sample_variable, output_sample_int_diff, + output_sample_float_diff, output_sample_float_np_broadcast_diff, + output_sample_np_diff, output_sample_variable_diff, + output_entropy_int, output_entropy_float, + output_entropy_float_np_broadcast, output_entropy_np, + output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, + output_lp_variable, output_p_float_np_broadcast, output_p_np, + output_p_variable, output_kl_float, output_kl_float_np_broadcast, + output_kl_np, output_kl_variable + ] = output_list + + np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) + np.testing.assert_equal(output_sample_float.shape, + gt_sample_float.shape) + np.testing.assert_equal(output_sample_float_np_broadcast.shape, + gt_sample_float_np_broadcast.shape) + np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) + np.testing.assert_equal(output_sample_variable.shape, + gt_sample_np.shape) + np.testing.assert_equal(output_sample_int_diff.shape, + gt_sample_int_diff.shape) + np.testing.assert_equal(output_sample_float_diff.shape, + gt_sample_float_diff.shape) + np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, + 
gt_sample_float_np_broadcast_diff.shape) + np.testing.assert_equal(output_sample_np_diff.shape, + gt_sample_np_diff.shape) + np.testing.assert_equal(output_sample_variable_diff.shape, + gt_sample_np_diff.shape) + np.testing.assert_allclose( + output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_entropy_float, + gt_entropy_float, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_entropy_float_np_broadcast, + gt_entropy_float_np_broadcast, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_lp_float_np_broadcast, + gt_lp_float_np_broadcast, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_p_float_np_broadcast, + gt_p_float_np_broadcast, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_p_np, gt_p, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_p_variable, gt_p, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_kl_float_np_broadcast, + gt_kl_float_np_broadcast, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_kl_np, gt_kl, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance) + + def test_normal_distribution_static(self, + batch_size=2, + dims=3, + sample_shape=7, + tolerance=1e-6): + """ + Test Normal's methods in static mode. + + Args: + refer to ``compare_normal_with_numpy`` function. + """ + test_program = fluid.Program() + data_list = self.get_normal_random_input(batch_size, dims) + loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list + + feed_vars, fetch_list = self.build_normal_static( + test_program, batch_size, dims, sample_shape, loc_float, + scale_float, other_loc_float, other_scale_float, scale_np, + other_scale_np, loc_np, other_loc_np, values_np) + self.executor.run(fluid.default_startup_program()) + + output_list = self.executor.run(program=test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, + sample_shape, tolerance) + + def test_normal_distribution_dygraph(self, + batch_size=2, + dims=3, + sample_shape=7, + tolerance=1e-6): + """ + Test Normal's methods in dynamic mode. + + Args: + refer to ``compare_normal_with_numpy`` function. 
+ """ + paddle.disable_static() + data_list = self.get_normal_random_input(batch_size, dims) + loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list + + output_list = self.build_normal_dygraph( + batch_size, dims, sample_shape, loc_float, scale_float, + other_loc_float, other_scale_float, scale_np, other_scale_np, + loc_np, other_loc_np, values_np) + + self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, + sample_shape, tolerance) + paddle.enable_static() + + def build_uniform_common_net(self, batch_size, dims, sample_shape, + low_float, high_float, high_np, low_np, + values_np, low, high, values): + """Generate Uniform object and get the output of its methods including ``sample``, ``entropy``, + ``log_prob`` and ``probs``. + Parameters ``low`` and ``high`` have different data types to test different situations. + + Args: + batch_size(int): The first dimension of the shape of parameters(low and high). + dims(int): The second dimension of the shape of parameters. + sample_shape(int): The sample value used in ``sample`` method. + low_float(float): Parameter ``low`` is a float number. + high_float(float): Parameter ``high`` is a float number. + high_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. + low_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. + values_np(numpy.ndarray): The input of ``log_prob`` and ``probs`` methods. An numpy array whose + shape is [batch_size, dims]. + low(Tensor): In dynamic mode, low is generated in ``build_uniform_dygraph``, it's a Tensor filled + with ``low_np`` data. In static mode, low is generated in ``build_uniform_static``. + high(Tensor): In dynamic mode, high is generated in ``build_uniform_dygraph``, it's a Tensor filled + with ``high_np`` data. In static mode, high is generated in ``build_uniform_static``. + values(Tensor): In dynamic mode, values is generated in ``build_uniform_dygraph``, it's a Tensor + filled with ``values_np`` data. In static mode, values is generated in ``build_uniform_static``. + + Returns: + List: The elements of the list are the output of sample, entropy, log_prob, probs methods. + The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be + considered. 
+ + """ + uniform_int = Uniform(int(low_float), int(high_float)) + uniform_float = Uniform(low_float, high_float) + uniform_float_np_broadcast = Uniform(low_float, high_np) + uniform_np = Uniform(low_np, high_np) + uniform_variable = Uniform(low, high) + + sample_int = uniform_int.sample([batch_size, dims]) + sample_float = uniform_float.sample([batch_size, dims]) + sample_float_np_broadcast = uniform_float_np_broadcast.sample( + [batch_size, dims]) + sample_np = uniform_np.sample([batch_size, dims]) + sample_variable = uniform_variable.sample([batch_size, dims]) + + sample_int_diff = uniform_int.sample([sample_shape]) + sample_float_diff = uniform_float.sample([sample_shape]) + sample_float_np_broadcast_diff = uniform_float_np_broadcast.sample( + [sample_shape]) + sample_np_diff = uniform_np.sample([sample_shape]) + sample_variable_diff = uniform_variable.sample([sample_shape]) + + entropy_int = uniform_int.entropy() + entropy_float = uniform_float.entropy() + entropy_float_np_broadcast = uniform_float_np_broadcast.entropy() + entropy_np = uniform_np.entropy() + entropy_variable = uniform_variable.entropy() + + lp_float_np_broadcast = uniform_float_np_broadcast.log_prob(values) + lp_np = uniform_np.log_prob(values) + lp_variable = uniform_variable.log_prob(values) + + p_float_np_broadcast = uniform_float_np_broadcast.probs(values) + p_np = uniform_np.probs(values) + p_variable = uniform_variable.probs(values) + + fetch_list = [ + sample_int, sample_float, sample_float_np_broadcast, sample_np, + sample_variable, sample_int_diff, sample_float_diff, + sample_float_np_broadcast_diff, sample_np_diff, + sample_variable_diff, entropy_int, entropy_float, + entropy_float_np_broadcast, entropy_np, entropy_variable, + lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, + p_np, p_variable + ] + return fetch_list + + def build_uniform_static(self, test_program, batch_size, dims, sample_shape, + low_float, high_float, high_np, low_np, values_np): + """ + In static mode, generate feed data of Uniform network, and get output fetch_list using + ``build_uniform_common_net``. + + Args: + test_program: In static mode, the Program object. + other args can refer to function ``build_uniform_common_net``. + + Returns: + feed_vars: The feed data of Uniform network in static mode. + fetch_list: The output is generated by function ``build_uniform_common_net``. + """ + with fluid.program_guard(test_program): + low = layers.data(name='low', shape=[dims], dtype='float32') + high = layers.data(name='high', shape=[dims], dtype='float32') + + values = layers.data(name='values', shape=[dims], dtype='float32') + + fetch_list = self.build_uniform_common_net( + batch_size, dims, sample_shape, low_float, high_float, high_np, + low_np, values_np, low, high, values) + + feed_vars = {'low': low_np, 'high': high_np, 'values': values_np} + return feed_vars, fetch_list + + def build_uniform_dygraph(self, batch_size, dims, sample_shape, low_float, + high_float, high_np, low_np, values_np): + """ + In dynamic mode, generate input data of Uniform network, and get output fetch_list using + ``build_uniform_common_net``. + + Args: + refer to function ``build_uniform_common_net``. + + Returns: + fetch_list_numpy: The output is generated by function ``build_uniform_common_net``. Transform + these tensor to numpy.ndarray. 
+ """ + low = paddle.to_tensor(low_np) + high = paddle.to_tensor(high_np) + values = paddle.to_tensor(values_np) + + fetch_list = self.build_uniform_common_net( + batch_size, dims, sample_shape, low_float, high_float, high_np, + low_np, values_np, low, high, values) + fetch_list_numpy = [t.numpy() for t in fetch_list] + return fetch_list_numpy + + def compare_uniform_with_numpy(self, + data_list, + output_list, + batch_size=2, + dims=3, + sample_shape=7, + tolerance=1e-6): + """ + Compare the outputs of Uniform's methods in paddle and numpy. If the outputs are not consistent, + raise errors. + + Args: + data_list: Input data including float and numpy.ndarray type of ``low`` and ``high`` parameters. + output_list: The outputs of Uniform's methods in static or dynamic mode. + batch_size(int): The first dimension of the shape of parameters(low and high). + dims(int): The second dimension of the shape of parameters. + sample_shape(int): The sample value used in ``sample`` method. + tolerance(float): The tolerance of the error. + """ + [low_np, low_float, high_float, high_np, values_np] = data_list + + np_uniform_int = UniformNumpy(int(low_float), int(high_float)) + np_uniform_float = UniformNumpy(low_float, high_float) + np_uniform_float_np_broadcast = UniformNumpy(low_float, high_np) + np_uniform = UniformNumpy(low_np, high_np) + + gt_sample_int = np_uniform_int.sample([batch_size, dims]) + gt_sample_float = np_uniform_float.sample([batch_size, dims]) + gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample( + [batch_size, dims]) + gt_sample_np = np_uniform.sample([batch_size, dims]) + gt_sample_int_diff = np_uniform_int.sample([sample_shape]) + gt_sample_float_diff = np_uniform_float.sample([sample_shape]) + gt_sample_float_np_broadcast_diff = np_uniform_float_np_broadcast.sample( + [sample_shape]) + gt_sample_np_diff = np_uniform.sample([sample_shape]) + gt_entropy_int = np_uniform_int.entropy() + gt_entropy_float = np_uniform_float.entropy() + gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy() + gt_entropy = np_uniform.entropy() + gt_lp_float_np_broadcast = np_uniform_float_np_broadcast.log_prob( + values_np) + gt_lp = np_uniform.log_prob(values_np) + gt_p_float_np_broadcast = np_uniform_float_np_broadcast.probs(values_np) + gt_p = np_uniform.probs(values_np) + + [ + output_sample_int, output_sample_float, + output_sample_float_np_broadcast, output_sample_np, + output_sample_variable, output_sample_int_diff, + output_sample_float_diff, output_sample_float_np_broadcast_diff, + output_sample_np_diff, output_sample_variable_diff, + output_entropy_int, output_entropy_float, + output_entropy_float_np_broadcast, output_entropy_np, + output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, + output_lp_variable, output_p_float_np_broadcast, output_p_np, + output_p_variable + ] = output_list + + np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) + np.testing.assert_equal(output_sample_float.shape, + gt_sample_float.shape) + np.testing.assert_equal(output_sample_float_np_broadcast.shape, + gt_sample_float_np_broadcast.shape) + np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) + np.testing.assert_equal(output_sample_variable.shape, + gt_sample_np.shape) + np.testing.assert_equal(output_sample_int_diff.shape, + gt_sample_int_diff.shape) + np.testing.assert_equal(output_sample_float_diff.shape, + gt_sample_float_diff.shape) + np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, + 
gt_sample_float_np_broadcast_diff.shape) + np.testing.assert_equal(output_sample_np_diff.shape, + gt_sample_np_diff.shape) + np.testing.assert_equal(output_sample_variable_diff.shape, + gt_sample_np_diff.shape) + np.testing.assert_allclose( + output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_entropy_float, + gt_entropy_float, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_entropy_float_np_broadcast, + gt_entropy_float_np_broadcast, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_lp_float_np_broadcast, + gt_lp_float_np_broadcast, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_p_float_np_broadcast, + gt_p_float_np_broadcast, + rtol=tolerance, + atol=tolerance) + np.testing.assert_allclose( + output_p_np, gt_p, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + output_p_variable, gt_p, rtol=tolerance, atol=tolerance) + + def test_uniform_distribution_static(self, + batch_size=2, + dims=3, + sample_shape=7, + tolerance=1e-6): + """ + Test Uniform's methods in static mode. + + Args: + refer to ``compare_uniform_with_numpy`` function. + """ + test_program = fluid.Program() + + low_np = np.random.randn(batch_size, dims).astype('float32') + low_float = np.random.uniform(-2, 1) + high_float = np.random.uniform(1, 3) + high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + values_np = np.random.randn(batch_size, dims).astype('float32') + + data_list = [low_np, low_float, high_float, high_np, values_np] + + feed_vars, fetch_list = self.build_uniform_static( + test_program, batch_size, dims, sample_shape, low_float, high_float, + high_np, low_np, values_np) + + self.executor.run(fluid.default_startup_program()) + + # result calculated by paddle + output_list = self.executor.run(program=test_program, + feed=feed_vars, + fetch_list=fetch_list) + self.compare_uniform_with_numpy(data_list, output_list, batch_size, + dims, sample_shape, tolerance) + + def test_uniform_distribution_dygraph(self, + batch_size=2, + dims=3, + sample_shape=7, + tolerance=1e-6): + """ + Test Uniform's methods in dynamic mode. + + Args: + refer to ``compare_uniform_with_numpy`` function. 
+ """ + paddle.disable_static() + + low_np = np.random.randn(batch_size, dims).astype('float32') + low_float = np.random.uniform(-2, 1) + high_float = np.random.uniform(1, 3) + high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + values_np = np.random.randn(batch_size, dims).astype('float32') + + data_list = [low_np, low_float, high_float, high_np, values_np] + output_list = self.build_uniform_dygraph(batch_size, dims, sample_shape, + low_float, high_float, high_np, + low_np, values_np) + + self.compare_uniform_with_numpy(data_list, output_list, batch_size, + dims, sample_shape, tolerance) + paddle.enable_static() + + +class DistributionTestError(unittest.TestCase): + def test_distribution_error(self): + distribution = Distribution() + + self.assertRaises(NotImplementedError, distribution.sample) + self.assertRaises(NotImplementedError, distribution.entropy) + + normal = Normal(0.0, 1.0) + self.assertRaises(NotImplementedError, distribution.kl_divergence, + normal) + + value_npdata = np.array([0.8], dtype="float32") + value_tensor = layers.create_tensor(dtype="float32") + self.assertRaises(NotImplementedError, distribution.log_prob, + value_tensor) + self.assertRaises(NotImplementedError, distribution.probs, value_tensor) + + def test_normal_error(self): + normal = Normal(0.0, 1.0) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, normal.log_prob, value) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, normal.probs, value) + + shape = 1.0 + # type of shape must be list + self.assertRaises(TypeError, normal.sample, shape) + + seed = 1.0 + # type of seed must be int + self.assertRaises(TypeError, normal.sample, [2, 3], seed) + + normal_other = Uniform(1.0, 2.0) + # type of other must be an instance of Normal + self.assertRaises(TypeError, normal.kl_divergence, normal_other) + + def test_uniform_error(self): + uniform = Uniform(0.0, 1.0) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, uniform.log_prob, value) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, uniform.probs, value) + + shape = 1.0 + # type of shape must be list + self.assertRaises(TypeError, uniform.sample, shape) + + seed = 1.0 + # type of seed must be int + self.assertRaises(TypeError, uniform.sample, [2, 3], seed) + + +class DistributionTestName(unittest.TestCase): + def get_prefix(self, string): + return (string.split('.')[0]) + + def test_normal_name(self): + name = 'test_normal' + normal1 = Normal(0.0, 1.0, name=name) + self.assertEqual(normal1.name, name) + + normal2 = Normal(0.0, 1.0) + self.assertEqual(normal2.name, 'Normal') + + paddle.enable_static() + + sample = normal1.sample([2]) + self.assertEqual(self.get_prefix(sample.name), name + '_sample') + + entropy = normal1.entropy() + self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') + + value_npdata = np.array([0.8], dtype="float32") + value_tensor = layers.create_tensor(dtype="float32") + layers.assign(value_npdata, value_tensor) + + lp = normal1.log_prob(value_tensor) + self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') + + p = normal1.probs(value_tensor) + self.assertEqual(self.get_prefix(p.name), name + '_probs') + + kl = normal1.kl_divergence(normal2) + self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence') + + def test_uniform_name(self): + name = 'test_uniform' + uniform1 = Uniform(0.0, 1.0, name=name) + self.assertEqual(uniform1.name, 
name) + + uniform2 = Uniform(0.0, 1.0) + self.assertEqual(uniform2.name, 'Uniform') + + paddle.enable_static() + + sample = uniform1.sample([2]) + self.assertEqual(self.get_prefix(sample.name), name + '_sample') + + entropy = uniform1.entropy() + self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') + + value_npdata = np.array([0.8], dtype="float32") + value_tensor = layers.create_tensor(dtype="float32") + layers.assign(value_npdata, value_tensor) + + lp = uniform1.log_prob(value_tensor) + self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') + + p = uniform1.probs(value_tensor) + self.assertEqual(self.get_prefix(p.name), name + '_probs') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index cc3910d1b0c828f572f5e618b7aa9c55ecb93987..d18c8e25974441a6989b18a0fe13bac91251de9d 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -18,6 +18,7 @@ import unittest import numpy as np import paddle.fluid.core as core from op_test import OpTest, skip_check_grad_ci +import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -236,5 +237,501 @@ class TestDropoutOpError(unittest.TestCase): self.assertRaises(TypeError, test_dtype) +class TestDropoutFAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[40, 40], dtype="float32") + res1 = paddle.nn.functional.dropout(x=input, p=0., training=False) + res2 = paddle.nn.functional.dropout( + x=input, p=0., axis=0, training=True, mode='upscale_in_train') + res3 = paddle.nn.functional.dropout( + x=input, p=0., axis=0, training=True, mode='downscale_in_infer') + res4 = paddle.nn.functional.dropout( + x=input, p=0., axis=0, training=False, mode='upscale_in_train') + res5 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=0, + training=False, + mode='downscale_in_infer') + res6 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=True, + mode='upscale_in_train') + res7 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=True, + mode='downscale_in_infer') + res8 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=False, + mode='upscale_in_train') + res9 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=False, + mode='downscale_in_infer') + res10 = paddle.nn.functional.dropout(x=input, p=1., training=True) + + in_np = np.random.random([40, 40]).astype("float32") + res_np = in_np + res_np2 = np.zeros_like(in_np) + + exe = fluid.Executor(place) + res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9] + for res in res_list: + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res]) + self.assertTrue(np.allclose(fetches[0], res_np)) + fetches2 = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res10]) + self.assertTrue(np.allclose(fetches2[0], res_np2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + in_np = np.random.random([40, 
40]).astype("float32") + res_np = in_np + res_np2 = np.zeros_like(in_np) + input = fluid.dygraph.to_variable(in_np) + + res1 = paddle.nn.functional.dropout( + x=input, p=0., training=False) + res2 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=0, + training=True, + mode='upscale_in_train') + res3 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=0, + training=True, + mode='downscale_in_infer') + res4 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=0, + training=False, + mode='upscale_in_train') + res5 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=0, + training=False, + mode='downscale_in_infer') + res6 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=True, + mode='upscale_in_train') + res7 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=True, + mode='downscale_in_infer') + res8 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=False, + mode='upscale_in_train') + res9 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=False, + mode='downscale_in_infer') + res10 = paddle.nn.functional.dropout( + x=input, p=1., training=True) + + res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9] + for res in res_list: + self.assertTrue(np.allclose(res.numpy(), res_np)) + self.assertTrue(np.allclose(res10.numpy(), res_np2)) + + +class TestDropoutFAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_Variable(): + # the input of dropout must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + paddle.nn.functional.dropout(x1, p=0.5) + + self.assertRaises(TypeError, test_Variable) + + def test_Variable2(): + # the input of dropout must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + paddle.nn.functional.dropout(x1, p=0.5, axis=0) + + self.assertRaises(TypeError, test_Variable2) + + def test_dtype(): + # the input dtype of dropout must be float32 or float64 + # float16 only can be set on GPU place + xr = fluid.data(name='xr', shape=[3, 4, 5, 6], dtype="int32") + paddle.nn.functional.dropout(xr, p=0.5) + + self.assertRaises(TypeError, test_dtype) + + def test_pdtype(): + # p should be int or float + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, p='0.5') + + self.assertRaises(TypeError, test_pdtype) + + def test_pvalue(): + # p should be 0.<=p<=1. 
+ x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, p=1.2) + + self.assertRaises(ValueError, test_pvalue) + + def test_mode(): + # mode should be 'downscale_in_infer' or 'upscale_in_train' + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, mode='abc') + + self.assertRaises(ValueError, test_mode) + + def test_axis(): + # axis should be int or list + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, axis=1.2) + + self.assertRaises(TypeError, test_axis) + + def test_axis_max(): + # maximum of axis should less than dimensions of x + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, axis=[0, 5]) + + self.assertRaises(ValueError, test_axis_max) + + def test_axis_len(): + # length of axis should not greater than dimensions of x + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, axis=[0, 1, 2, 3, 4]) + + self.assertRaises(ValueError, test_axis_len) + + +class TestDropoutCAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_np = np.random.random([40, 40]).astype("float32") + result_np = input_np + input = fluid.dygraph.to_variable(input_np) + m = paddle.nn.Dropout(p=0.) + m.eval() + result = m(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + +class TestDropout2dFAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 4, 5], dtype="float32") + res1 = paddle.nn.functional.dropout2d( + x=input, p=0., training=False, data_format='NCHW') + res2 = paddle.nn.functional.dropout2d( + x=input, p=0., training=False, data_format='NHWC') + + in_np = np.random.random([2, 3, 4, 5]).astype("float32") + res_np = in_np + + exe = fluid.Executor(place) + res_list = [res1, res2] + for res in res_list: + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res]) + self.assertTrue(np.allclose(fetches[0], res_np)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + in_np = np.random.random([2, 3, 4, 5]).astype("float32") + res_np = in_np + input = fluid.dygraph.to_variable(in_np) + + res1 = paddle.nn.functional.dropout2d( + x=input, p=0., training=False, data_format='NCHW') + res2 = paddle.nn.functional.dropout2d( + x=input, p=0., training=False, data_format='NHWC') + + res_list = [res1, res2] + for res in res_list: + self.assertTrue(np.allclose(res.numpy(), res_np)) + + +class TestDropout2dFAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_xdim(): + # dimentions of x should be 4 + x = fluid.data(name='x1', shape=[2, 3, 4, 5, 6], dtype="int32") + paddle.nn.functional.dropout2d(x) + + self.assertRaises(ValueError, test_xdim) + + def test_dataformat(): + # data_format should be 'NCHW' or 'NHWC' + x = fluid.data(name='x2', shape=[2, 3, 4, 5], 
dtype="int32") + paddle.nn.functional.dropout2d(x, data_format='CNHW') + + self.assertRaises(ValueError, test_dataformat) + + +class TestDropout2DCAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 4, 5]).astype("float32") + result_np = input_np + input = fluid.dygraph.to_variable(input_np) + m = paddle.nn.Dropout2D(p=0.) + m.eval() + result = m(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + +class TestDropout3dFAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 4, 5, 6], dtype="float32") + res1 = paddle.nn.functional.dropout3d( + x=input, p=0., training=False, data_format='NCDHW') + res2 = paddle.nn.functional.dropout3d( + x=input, p=0., training=False, data_format='NDHWC') + + in_np = np.random.random([2, 3, 4, 5, 6]).astype("float32") + res_np = in_np + + exe = fluid.Executor(place) + res_list = [res1, res2] + for res in res_list: + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res]) + self.assertTrue(np.allclose(fetches[0], res_np)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + in_np = np.random.random([2, 3, 4, 5, 6]).astype("float32") + res_np = in_np + input = fluid.dygraph.to_variable(in_np) + + res1 = paddle.nn.functional.dropout3d( + x=input, p=0., training=False, data_format='NCDHW') + res2 = paddle.nn.functional.dropout3d( + x=input, p=0., training=False, data_format='NDHWC') + + res_list = [res1, res2] + for res in res_list: + self.assertTrue(np.allclose(res.numpy(), res_np)) + + +class TestDropout3dFAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_xdim(): + # dimentions of x should be 5 + x = fluid.data(name='x1', shape=[2, 3, 4, 5], dtype="int32") + paddle.nn.functional.dropout3d(x) + + self.assertRaises(ValueError, test_xdim) + + def test_dataformat(): + # data_format should be 'NCDHW' or 'NDHWC' + x = fluid.data(name='x2', shape=[2, 3, 4, 5, 6], dtype="int32") + paddle.nn.functional.dropout3d(x, data_format='CNDHW') + + self.assertRaises(ValueError, test_dataformat) + + +class TestDropout3DCAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 4, 5, 6]).astype("float32") + result_np = input_np + input = fluid.dygraph.to_variable(input_np) + m = paddle.nn.Dropout3D(p=0.) 
+ m.eval() + result = m(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + +class TestAlphaDropoutFAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[40, 40], dtype="float32") + res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.) + res2 = paddle.nn.functional.alpha_dropout( + x=input, p=0., training=False) + + in_np = np.random.random([40, 40]).astype("float32") + res_np = in_np + + exe = fluid.Executor(place) + res_list = [res1, res2] + for res in res_list: + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res]) + self.assertTrue(np.allclose(fetches[0], res_np)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + in_np = np.random.random([40, 40]).astype("float32") + res_np = in_np + input = fluid.dygraph.to_variable(in_np) + + res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.) + res2 = paddle.nn.functional.alpha_dropout( + x=input, p=0., training=False) + + res_list = [res1, res2] + for res in res_list: + self.assertTrue(np.allclose(res.numpy(), res_np)) + + +class TestAlphaDropoutFAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_Variable(): + # the input of dropout must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + paddle.nn.functional.alpha_dropout(x1, p=0.5) + + self.assertRaises(TypeError, test_Variable) + + def test_dtype(): + # the input dtype of dropout must be float32 or float64 + xr = fluid.data(name='xr', shape=[3, 4, 5, 6], dtype="int32") + paddle.nn.functional.alpha_dropout(xr) + + self.assertRaises(TypeError, test_dtype) + + def test_pdtype(): + # p should be int or float + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.alpha_dropout(x2, p='0.5') + + self.assertRaises(TypeError, test_pdtype) + + def test_pvalue(): + # p should be 0.<=p<=1. + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.alpha_dropout(x2, p=1.2) + + self.assertRaises(ValueError, test_pvalue) + + +class TestAlphaDropoutCAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_np = np.random.random([40, 40]).astype("float32") + result_np = input_np + input = fluid.dygraph.to_variable(input_np) + m = paddle.nn.AlphaDropout(p=0.) 
+ m.eval() + result = m(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index ae4355ec412c87ff1d947338d4f15256ace539b8..88b496c1d89e63752f93bd2abb77c61af7e86c4d 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -27,6 +27,8 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope +SEED = 123123111 + class SimpleImgConvPool(fluid.dygraph.Layer): def __init__(self, @@ -105,12 +107,11 @@ class MNIST(fluid.dygraph.Layer): class TestDygraphMultiForward(unittest.TestCase): def test_mnist_forward_float32(self): - seed = 90 epoch_num = 1 - with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + with fluid.dygraph.guard(): + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) mnist = MNIST() sgd = SGDOptimizer( learning_rate=1e-3, parameter_list=mnist.parameters()) @@ -142,9 +143,8 @@ class TestDygraphMultiForward(unittest.TestCase): dy_param_init_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..466226c53fabbd315acd19c6421f210d0ca225c1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -0,0 +1,183 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
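A quick aside on the two dropout modes exercised by the tests above: because they all use p=0. (or p=1.), the exact masks never matter, and every result is compared against the unchanged input (or against zeros). For readers who want the general behaviour, the following NumPy sketch shows my reading of the two modes, inferred from the mode names and the expected outputs in the tests; the helper name dropout_reference is mine and is not part of the patch.

import numpy as np

def dropout_reference(x, p, training=True, mode='upscale_in_train', rng=None):
    # Sketch of the assumed semantics, not the Paddle implementation.
    rng = np.random.default_rng() if rng is None else rng
    if training:
        keep = (rng.random(x.shape) >= p).astype(x.dtype)
        if mode == 'upscale_in_train':
            # scale kept units at train time so inference is a plain identity
            return x * keep / (1.0 - p) if p < 1.0 else np.zeros_like(x)
        return x * keep                      # 'downscale_in_infer': no scaling here
    if mode == 'upscale_in_train':
        return x                             # identity at inference time
    return x * (1.0 - p)                     # 'downscale_in_infer': rescale at inference

# With p=0. both modes return x unchanged, which is why the tests compare
# against res_np = in_np; with p=1. and training=True the output is all zeros.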
+ +from __future__ import print_function + +import unittest +import numpy +import collections +from functools import reduce +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.nn.utils import weight_norm, remove_weight_norm + + +class TestDygraphWeightNorm(unittest.TestCase): + def setUp(self): + self.init_test_case() + self.set_data() + + def init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = None + + def set_data(self): + self.data = collections.OrderedDict() + for desc in self.data_desc: + data_name = desc[0] + data_shape = desc[1] + data_value = numpy.random.random( + size=[self.batch_size] + data_shape).astype('float32') + self.data[data_name] = data_value + + def norm_except_dim(self, w, dim=None): + shape = w.shape + ndims = len(shape) + shape_numel = reduce(lambda x, y: x * y, shape) + if dim == -1: + return numpy.linalg.norm(w, axis=None, keepdims=True) + elif dim == 0: + tile_shape = list(w.shape) + tile_shape[0] = 1 + w_matrix = numpy.reshape(w, (shape[0], shape_numel // shape[0])) + return numpy.linalg.norm(w_matrix, axis=1, keepdims=True) + elif dim == (ndims - 1): + w_matrix = numpy.reshape(w, (shape_numel // shape[-1], shape[-1])) + return numpy.linalg.norm(w_matrix, axis=0, keepdims=True) + else: + perm = list(range(ndims)) + perm_ori = list(range(ndims)) + perm[0] = dim + perm[dim] = 0 + p_transposed = numpy.transpose(w, perm) + return self.norm_except_dim(p_transposed, 0) + + def weight_normalize(self, w, dim=None): + shape = w.shape + ndims = len(shape) + shape_numel = reduce(lambda x, y: x * y, shape) + v = w + g = self.norm_except_dim(w, dim) + g_mul = g + + if dim == -1: + v_norm = v / (numpy.linalg.norm(v, axis=None, keepdims=True)) + elif dim == 0: + w_matrix = numpy.reshape(w, (shape[0], shape_numel // shape[0])) + v_norm = v / numpy.linalg.norm(w_matrix, axis=1) + v_norm = numpy.reshape(v_norm, shape) + g = numpy.squeeze(g, axis=1) + elif dim == (ndims - 1): + w_matrix = numpy.reshape(w, (shape_numel // shape[-1], shape[-1])) + v_norm = v / numpy.linalg.norm(w_matrix, axis=0, keepdims=True) + v_norm = numpy.reshape(v_norm, shape) + else: + perm = list(range(ndims)) + perm[0] = dim + perm[dim] = 0 + p_transposed = numpy.transpose(v, perm) + transposed_shape = p_transposed.shape + transposed_shape_numel = reduce(lambda x, y: x * y, + transposed_shape) + p_matrix = numpy.reshape( + p_transposed, (p_transposed.shape[0], + transposed_shape_numel // p_transposed.shape[0])) + v_norm = v / numpy.expand_dims( + numpy.expand_dims( + numpy.linalg.norm( + p_matrix, axis=1, keepdims=True), axis=0), + axis=(ndims - 1)) + v_norm = numpy.reshape(v_norm, transposed_shape) + v_norm = numpy.transpose(v_norm, perm) + g = numpy.squeeze(g, axis=1) + if dim == 1: + eaxis = 2 + elif dim == 2: + eaxis = 1 + g_mul = numpy.expand_dims( + numpy.expand_dims( + numpy.expand_dims( + g, axis=0), axis=eaxis), + axis=(ndims - 1)) + w = g_mul * v_norm + return g, v + + def test_check_output(self): + fluid.enable_imperative() + linear = paddle.nn.Conv2d(2, 3, 3) + before_weight = linear.weight.numpy() + if self.dim == None: + self.dim = -1 + wn = weight_norm(linear, dim=self.dim) + outputs = [] + for name, data in self.data.items(): + output = linear(fluid.dygraph.to_variable(data)) + outputs.append(output.numpy()) + after_weight = linear.weight + self.actual_outputs = [linear.weight_g.numpy(), linear.weight_v.numpy()] + + expect_output = self.weight_normalize(before_weight, self.dim) + + for expect, actual in 
zip(expect_output, self.actual_outputs): + self.assertTrue( + numpy.allclose( + numpy.array(actual), expect, atol=0.001)) + + +class TestDygraphWeightNormCase1(TestDygraphWeightNorm): + def init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = 0 + + +class TestDygraphWeightNormCase2(TestDygraphWeightNorm): + def init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = 1 + + +class TestDygraphWeightNormCase3(TestDygraphWeightNorm): + def init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = 3 + + +class TestDygraphRemoveWeightNorm(unittest.TestCase): + def setUp(self): + self.init_test_case() + + def init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = None + + def test_check_output(self): + fluid.enable_imperative() + linear = paddle.nn.Conv2d(2, 3, 3) + before_weight = linear.weight + wn = weight_norm(linear, dim=self.dim) + rwn = remove_weight_norm(linear) + after_weight = linear.weight + self.assertTrue( + numpy.allclose( + before_weight.numpy(), after_weight.numpy(), atol=0.001)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 6c0bb97bf6f14b2cef5050a99c02c7843b86be92..e0c0277270b406bf745a5293e9c1bcbe158e52d0 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -18,6 +18,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -465,9 +466,9 @@ class PaddingRNNTestBase(unittest.TestCase): pass def _prepare_program(self, config, parallel=True): + paddle.manual_seed(config.random_seed) self.main_program = fluid.Program() self.startup_program = fluid.Program() - self.startup_program.random_seed = config.random_seed with fluid.program_guard(self.main_program, self.startup_program): with fluid.unique_name.guard(): res_vars = lm_model( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index de0fc591b664728387ccb988f3611fe034989627..9ebaf8ff9438be8c8a57815be0798b861d05caaf 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -240,25 +240,124 @@ class TestElementwiseDivBroadcast(unittest.TestCase): self.assertEqual((out_result == (2 / x)).all(), True) -class TestDivOp(unittest.TestCase): - def test_name(self): - with fluid.program_guard(fluid.Program()): - x = fluid.data(name="x", shape=[2, 3], dtype="float32") - y = fluid.data(name='y', shape=[2, 3], dtype='float32') +class TestDivideAPI(unittest.TestCase): + def setUp(self): + paddle.set_default_dtype("float64") + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + # rule 1 + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = np.array([1, 2, 3]) + self.assertRaises(TypeError, paddle.divide, x=x, y=y) + + # rule 2: both the inputs are not Tensor + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = 2 + y = 4 
+ res = paddle.divide(x, y) + exe = fluid.Executor(place) + np_z = exe.run(fluid.default_main_program(), + feed={}, + fetch_list=[res]) + self.assertEqual(np_z[0] == 0.5, True) + + # rule 3: + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = fluid.data(name="y", shape=[3], dtype="float32") + self.assertRaises(TypeError, paddle.divide, x=x, y=y) + + # rule 4: x is Tensor, y is scalar + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = 2 + exe = fluid.Executor(place) + res = x / y + np_z = exe.run(fluid.default_main_program(), + feed={"x": np.array([2, 3, 4]).astype('float64')}, + fetch_list=[res]) + z_expected = np.array([1., 1.5, 2.]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + # rule 5: y is Tensor, x is scalar + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = 2 + exe = fluid.Executor(place) + res = y / x + np_z = exe.run(fluid.default_main_program(), + feed={"x": np.array([2, 8, 4]).astype('float64')}, + fetch_list=[res]) + z_expected = np.array([1., 0.25, 0.5]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + # rule 6: y is Tensor, x is Tensor + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = fluid.data(name="y", shape=[3], dtype="float64") + exe = fluid.Executor(place) + res = x / y + np_z = exe.run(fluid.default_main_program(), + feed={ + "x": np.array([2, 3, 4]).astype('float64'), + "y": np.array([1, 5, 2]).astype('float64') + }, + fetch_list=[res]) + z_expected = np.array([2., 0.6, 2.]) + self.assertEqual((np_z[0] == z_expected).all(), True) - y_1 = paddle.div(x, y, name='div_res') - self.assertEqual(('div_res' in y_1.name), True) + def test_static(self): + for place in self.places: + self.check_static_result(place=place) def test_dygraph(self): - with fluid.dygraph.guard(): - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = fluid.dygraph.to_variable(np_x) - y = fluid.dygraph.to_variable(np_y) - z = paddle.div(x, y) - np_z = z.numpy() - z_expected = np.array([2., 0.6, 2.]) - self.assertEqual((np_z == z_expected).all(), True) + for place in self.places: + with fluid.dygraph.guard(place): + # rule 1 : avoid numpy.ndarray + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x) + self.assertRaises(TypeError, paddle.divide, x=x, y=np_y) + + # rule 2: both the inputs are not Tensor + z = paddle.divide(3, 2) + self.assertEqual(z.numpy()[0] == 1.5, True) + + # rule 3: both the inputs are Tensor + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x, dtype="float32") + y = paddle.to_tensor(np_y, dtype="float64") + self.assertRaises(TypeError, paddle.divide, x=x, y=y) + + # rule 4: x is Tensor, y is scalar + np_x = np.array([2, 3, 4]) + x = paddle.to_tensor(np_x, dtype="int32") + y = 2 + z = x / y + z_expected = np.array([1., 1.5, 2.]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + # rule 5: y is Tensor, x is scalar + np_x = np.array([2, 1, 4]) + x = paddle.to_tensor(np_x, dtype="int32") + y = 2 + z = y / x + z_expected = np.array([1., 2., 0.5]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + # rule 6: y is Tensor, x is Tensor + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z 
= x / y + z_expected = np.array([2., 0.6, 2.]) + self.assertEqual((z_expected == z.numpy()).all(), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py index 104e896b6e440f5657a90e0ce741b49f72ba75c6..0b6acc7615395ed99a484e0e56f9c62447a1f345 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py @@ -15,6 +15,8 @@ from __future__ import print_function import unittest import numpy as np +import paddle +import paddle.fluid as fluid import paddle.fluid.core as core from op_test import OpTest @@ -56,6 +58,13 @@ class TestElementwiseModOp(OpTest): pass +class TestElementwiseModOpInverse(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + class TestElementwiseModOp_scalar(TestElementwiseModOp): def init_input_output(self): scale_x = random.randint(0, 100000000) @@ -65,5 +74,146 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp): self.out = np.floor_divide(self.x, self.y) +class TestFloorDivideAPI(unittest.TestCase): + def setUp(self): + paddle.set_default_dtype("float64") + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + # rule 1 + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = np.array([1, 2, 3]) + self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) + + # rule 2: both the inputs are not Tensor + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = 2 + y = 4 + res = paddle.floor_divide(x, y) + exe = fluid.Executor(place) + np_z = exe.run(fluid.default_main_program(), + feed={}, + fetch_list=[res]) + self.assertEqual(np_z[0] == 0., True) + + # rule 3: + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = fluid.data(name="y", shape=[3], dtype="float32") + self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) + + # rule 4: x is Tensor, y is scalar + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = 2 + exe = fluid.Executor(place) + res = x // y + np_z = exe.run(fluid.default_main_program(), + feed={"x": np.array([2, 3, 4]).astype('float64')}, + fetch_list=[res]) + z_expected = np.array([1., 1., 2.]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + # rule 5: y is Tensor, x is scalar + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = 2 + exe = fluid.Executor(place) + res = y // x + np_z = exe.run(fluid.default_main_program(), + feed={"x": np.array([2, 8, 4]).astype('float64')}, + fetch_list=[res]) + z_expected = np.array([1., 0., 0.]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + # rule 6: y is Tensor, x is Tensor + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = fluid.data(name="y", shape=[3], dtype="float64") + exe = fluid.Executor(place) + res = x // y + np_z = exe.run(fluid.default_main_program(), + feed={ + "x": np.array([2, 3, 4]).astype('float64'), + "y": np.array([1, 5, 
2]).astype('float64') + }, + fetch_list=[res]) + z_expected = np.array([2., 0., 2.]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + # rule 1 : avoid numpy.ndarray + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x) + self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y) + + # rule 2: both the inputs are not Tensor + z = paddle.floor_divide(3, 2) + self.assertEqual(z.numpy()[0] == 1., True) + + # rule 3: both the inputs are Tensor + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x, dtype="float32") + y = paddle.to_tensor(np_y, dtype="float64") + self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) + + # rule 4: x is Tensor, y is scalar + np_x = np.array([2, 3, 4]) + x = paddle.to_tensor(np_x, dtype="int32") + y = 2 + z = x // y + z_expected = np.array([1, 1, 2]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + # rule 5: y is Tensor, x is scalar + np_x = np.array([2, 1, 4]) + x = paddle.to_tensor(np_x, dtype="int32") + y = 2 + z = y // x + z_expected = np.array([1, 2, 0]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + # rule 6: y is Tensor, x is Tensor + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x // y + z_expected = np.array([2., 0., 2.]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + with fluid.dygraph.guard(fluid.CPUPlace()): + # divide by zero + np_x = np.array([2, 3, 4]) + np_y = np.array([0]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + try: + z = x // y + except Exception as e: + print("Error: Divide by zero encounter in floor_divide\n") + + # divide by zero + np_x = np.array([2]) + np_y = np.array([0, 0, 0]) + x = paddle.to_tensor(np_x, dtype="int32") + y = paddle.to_tensor(np_y, dtype="int32") + try: + z = x // y + except Exception as e: + print("Error: Divide by zero encounter in floor_divide\n") + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py index 2c0fdf51769782e046b1b18ebd31782c81fd49f0..f5d8b4f704da8acd97475444346522f63d3724fd 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -15,6 +15,8 @@ from __future__ import print_function import unittest import numpy as np +import paddle +import paddle.fluid as fluid import paddle.fluid.core as core from op_test import OpTest @@ -82,5 +84,142 @@ class TestElementwiseModOpDouble(TestElementwiseModOpFloat): self.dtype = np.float64 +class TestRemainderAPI(unittest.TestCase): + def setUp(self): + paddle.set_default_dtype("float64") + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + # rule 1 + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = np.array([1, 2, 3]) + self.assertRaises(TypeError, paddle.remainder, x=x, y=y) + + # rule 3: + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = fluid.data(name="y", shape=[3], dtype="float32") + 
self.assertRaises(TypeError, paddle.remainder, x=x, y=y) + + # rule 4: x is Tensor, y is scalar + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = 2 + exe = fluid.Executor(place) + res = x % y + np_z = exe.run(fluid.default_main_program(), + feed={"x": np.array([2, 3, 4]).astype('float64')}, + fetch_list=[res]) + z_expected = np.array([0., 1., 0.]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + # rule 5: y is Tensor, x is scalar + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = 3 + y = fluid.data(name="y", shape=[3], dtype="float32") + self.assertRaises(TypeError, paddle.remainder, x=x, y=y) + + # rule 6: y is Tensor, x is Tensor + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[3], dtype="float64") + y = fluid.data(name="y", shape=[1], dtype="float64") + exe = fluid.Executor(place) + res = x % y + np_z = exe.run(fluid.default_main_program(), + feed={ + "x": np.array([1., 2., 4]).astype('float64'), + "y": np.array([1.5]).astype('float64') + }, + fetch_list=[res]) + z_expected = np.array([1., 0.5, 1.0]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + # rule 6: y is Tensor, x is Tensor + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[6], dtype="float64") + y = fluid.data(name="y", shape=[1], dtype="float64") + exe = fluid.Executor(place) + res = x % y + np_z = exe.run( + fluid.default_main_program(), + feed={ + "x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'), + "y": np.array([2]).astype('float64') + }, + fetch_list=[res]) + z_expected = np.array([1., 0., 1., 1., 0., 1.]) + self.assertEqual((np_z[0] == z_expected).all(), True) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + # rule 1 : avoid numpy.ndarray + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x) + self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y) + + # rule 3: both the inputs are Tensor + np_x = np.array([2, 3, 4]) + np_y = np.array([1, 5, 2]) + x = paddle.to_tensor(np_x, dtype="float32") + y = paddle.to_tensor(np_y, dtype="float64") + self.assertRaises(TypeError, paddle.remainder, x=x, y=y) + + # rule 4: x is Tensor, y is scalar + np_x = np.array([2, 3, 4]) + x = paddle.to_tensor(np_x, dtype="int32") + y = 2 + z = x % y + z_expected = np.array([0, 1, 0]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + # rule 5: y is Tensor, x is scalar + np_x = np.array([2, 3, 4]) + x = paddle.to_tensor(np_x) + self.assertRaises(TypeError, paddle.remainder, x=3, y=x) + + # rule 6: y is Tensor, x is Tensor + np_x = np.array([1., 2., 4]) + np_y = np.array([1.5]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x % y + z_expected = np.array([1., 0.5, 1.0]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + # rule 6: y is Tensor, x is Tensor + np_x = np.array([-3., -2, -1, 1, 2, 3]) + np_y = np.array([2.]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x % y + z_expected = np.array([1., 0., 1., 1., 0., 1.]) + self.assertEqual((z_expected == z.numpy()).all(), True) + + np_x = np.array([-3.3, 11.5, -2, 3.5]) + np_y = np.array([-1.2, 2., 3.3, -2.3]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x % y + z_expected = np.array([-0.9, 1.5, 1.3, -1.1]) + 
self.assertEqual(np.allclose(z_expected, z.numpy()), True) + + np_x = np.array([-3, 11, -2, 3]) + np_y = np.array([-1, 2, 3, -2]) + x = paddle.to_tensor(np_x, dtype="int64") + y = paddle.to_tensor(np_y, dtype="int64") + z = x % y + z_expected = np.array([0, 1, 1, -1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index e86f18a62167b7feab1549072fc296f847c00491..12b75c8bf703d2b31e6abb08bb233fb2874828ce 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -29,7 +29,7 @@ class TestElementwiseMulDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.005 dtype = np.float64 @@ -56,7 +56,7 @@ class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.005 dtype = np.float64 @@ -83,7 +83,7 @@ class TestElementwiseAddDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.005 dtype = np.float64 @@ -110,7 +110,7 @@ class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.005 dtype = np.float64 @@ -137,7 +137,7 @@ class TestElementwiseSubDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.005 dtype = np.float64 @@ -164,7 +164,7 @@ class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.005 dtype = np.float64 @@ -191,7 +191,7 @@ class TestElementwiseDivDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.0001 dtype = np.float64 @@ -219,7 +219,7 @@ class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] + shape = [2, 3, 4, 5] eps = 0.0001 dtype = np.float64 diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py index 5a562dc14650a74ee6f76fa3d8c5f207da6475d6..c18b7c5b044e76fdfbb53b68633cfaf222190a38 100644 --- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py @@ -13,6 +13,7 @@ # limitations under the License. 
import numpy as np +import paddle import paddle.fluid as fluid import six import unittest @@ -37,13 +38,13 @@ class TestEmbeddingIdStopGradientBase(unittest.TestCase): self.assertTrue(np.array_equal(grad_value1, grad_value2)) def run_program(self, place, stop_gradient=False): + np.random.seed(1) + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + startup_program = fluid.Program() main_program = fluid.Program() - np.random.seed(1) - startup_program.random_seed = 1 - main_program.random_seed = 1 - scope = fluid.Scope() with fluid.program_guard(main_program, startup_program): with fluid.scope_guard(scope): diff --git a/python/paddle/fluid/tests/unittests/test_erf_op.py b/python/paddle/fluid/tests/unittests/test_erf_op.py index 93ab0212f136adfedacb52a2fde47e15edf279d3..964e704c6a2ccbdc96fc281f6e417caf8351cdf7 100644 --- a/python/paddle/fluid/tests/unittests/test_erf_op.py +++ b/python/paddle/fluid/tests/unittests/test_erf_op.py @@ -19,6 +19,7 @@ import numpy as np from scipy.special import erf from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.dygraph as dg @@ -58,6 +59,12 @@ class TestErfLayer(unittest.TestCase): if fluid.is_compiled_with_cuda(): self._test_case(fluid.CUDAPlace(0)) + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = paddle.static.data('x', [3, 4]) + y = paddle.erf(x, name='erf') + self.assertTrue('erf' in y.name) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py index 0ccb725870cdeb2c01236e9d8041e6b308cdbc71..4bc6bf3744f26cf7618d255f306bdb8f5fefb7a0 100755 --- a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py @@ -93,8 +93,19 @@ class TestExpandAsOpRank4(OpTest): self.check_grad(['X'], 'Out') +class TestExpandAsV2Error(unittest.TestCase): + def test_errors(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + x1 = fluid.layers.data(name='x1', shape=[4], dtype="uint8") + x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32") + self.assertRaises(TypeError, paddle.tensor.expand_as, x1, x2) + x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool") + x3.stop_gradient = False + self.assertRaises(ValueError, paddle.tensor.expand_as, x3, x2) + + # Test python API -class TestExpandAPI(unittest.TestCase): +class TestExpandAsV2API(unittest.TestCase): def test_api(self): input1 = np.random.random([12, 14]).astype("float32") input2 = np.random.random([2, 12, 14]).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index e5a7e6c702aec114968adb7fb23309c9d944559d..ec30cb70c579092b1ee03b9debc9a26dcb19926e 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -13,6 +13,7 @@ # limitations under the License. 
import unittest +import paddle import numpy as np from op_test import OpTest import paddle.fluid as fluid @@ -135,31 +136,32 @@ class TestFCOpWithPadding(TestFCOp): class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase): def test_api(self): - startup_program = Program() - main_program = Program() - startup_program.random_seed = SEED - main_program.random_seed = SEED - - with program_guard(main_program, startup_program): - input = np.random.random([2, 2, 25]).astype("float32") - x = fluid.layers.data( - name="x", - shape=[2, 2, 25], - append_batch_size=False, - dtype="float32") - - out_1 = fluid.layers.fc(input=x, size=1, num_flatten_dims=-1) - out_2 = fluid.layers.fc(input=x, size=1, num_flatten_dims=2) - - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) - exe = fluid.Executor(place=place) - exe.run(startup_program) - res_1, res_2 = exe.run(main_program, - feed={"x": input}, - fetch_list=[out_1, out_2]) - - assert np.array_equal(res_1, res_2) + def run_program(num_flatten_dims): + paddle.manual_seed(SEED) + startup_program = Program() + main_program = Program() + + with program_guard(main_program, startup_program): + input = np.random.random([2, 2, 25]).astype("float32") + x = fluid.layers.data( + name="x", + shape=[2, 2, 25], + append_batch_size=False, + dtype="float32") + + out = fluid.layers.fc(input=x, + size=1, + num_flatten_dims=num_flatten_dims) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = fluid.Executor(place=place) + exe.run(startup_program) + out = exe.run(main_program, feed={"x": input}, fetch_list=[out]) + + res_1 = run_program(-1) + res_2 = run_program(2) + self.assertTrue(np.array_equal(res_1, res_2)) class TestFCOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index 4bd56802efd462ff63498f54699012776ce6f47c..642044bb4b1152b0c6d2b5a8a64e22410f9bd151 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -145,19 +145,22 @@ class TestFlatten2OpError(unittest.TestCase): x = x.astype('float32') def test_ValueError1(): - x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32') + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') out = paddle.flatten(x_var, start_axis=2, stop_axis=1) self.assertRaises(ValueError, test_ValueError1) def test_ValueError2(): - x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32') + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') paddle.flatten(x_var, start_axis=10, stop_axis=1) self.assertRaises(ValueError, test_ValueError2) def test_ValueError3(): - x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32') + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') paddle.flatten(x_var, start_axis=2, stop_axis=10) self.assertRaises(ValueError, test_ValueError3) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 22a1434ae251a920182226e20758ba3c925b0a75..38c3903306e6e76188cdb50476d6797814c434e9 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -51,7 +51,7 @@ class 
TestFleetAMPOptimizer(unittest.TestCase): "custom_black_list": ['tanh'], } - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index ca657a5a619b66b31fcb6dc64b48040ed9f549ae..9e651dea24ba7f35f3785093da8ac73dde07be5a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -17,6 +17,7 @@ import paddle import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import os +import paddle.fluid as fluid class TestFleetBase(unittest.TestCase): @@ -119,24 +120,9 @@ class TestFleetBase(unittest.TestCase): optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer) - def test_minimize(self): - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + def test_exception(self): + import paddle.distributed.fleet as fleet + self.assertRaises(Exception, fleet.init_worker) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py new file mode 100644 index 0000000000000000000000000000000000000000..d666ea6740be149723e3bdbc00857a8931ce318e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.fluid as fluid + + +class TestFleetBase(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_ps_minimize(self): + import paddle + import paddle.distributed.fleet as fleet + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + role = fleet.PaddleCloudRoleMaker(is_collective=False) + fleet.init(role) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = False + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name) + compiled_prog = fluid.compiler.CompiledProgram( + fluid.default_main_program()) + self.assertRaises( + Exception, + fleet.save_inference_model, + dirname='/tmp/', + feeded_var_names=['x', 'y'], + target_vars=[avg_cost], + executor=pe) + + self.assertRaises( + Exception, + fleet.save_inference_model, + dirname='/tmp/', + feeded_var_names=['x', 'y'], + target_vars=[avg_cost], + executor="exe") + + self.assertRaises( + Exception, + fleet.save_inference_model, + dirname='/tmp/', + feeded_var_names=['x', 'y'], + target_vars=[avg_cost], + executor=exe, + main_program=compiled_prog) + + self.assertRaises( + Exception, fleet.save_persistables, executor=pe, dirname='/tmp/') + + self.assertRaises( + Exception, fleet.save_persistables, executor="exe", dirname='/tmp/') + + self.assertRaises( + Exception, + fleet.save_persistables, + executor=exe, + dirname='/tmp/', + main_program=compiled_prog) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py new file mode 100644 index 0000000000000000000000000000000000000000..f5e888ab0eb3ca597bf62245ff9f3024fe81ee95 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import paddle +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import paddle.fluid as fluid + + +class TestFleetBase(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_collective_minimize(self): + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py new file mode 100644 index 0000000000000000000000000000000000000000..1b3fbb86a4af55d6838df3a628bf2cf194c5235d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py @@ -0,0 +1,45 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.fluid as fluid + + +class TestFleetBase(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_fleet_init(self): + import paddle.distributed.fleet as fleet + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + role = fleet.PaddleCloudRoleMaker(is_collective=False) + fleet.init(role) + fleet.init() + fleet.init(is_collective=False) + self.assertRaises(Exception, fleet.init, is_collective="F") + self.assertRaises(Exception, fleet.init, role_maker="F") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py index b43687ce1cdabaf8918bed94173325a9d7cfe6e3..55d4ff7726aace09e486156d26efdecf22b310a5 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -60,7 +60,8 @@ class TestFleetDGCOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -72,7 +73,7 @@ class TestFleetDGCOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -87,7 +88,8 @@ class TestFleetDGCOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 40e0168e1ac93dfd93a99c19eced05756a49471f..8d715674cc6c9ba4f8b5c1ff4fe0cbdbe7841643 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -294,6 +294,28 @@ class TestStrategyConfig(unittest.TestCase): with self.assertRaises(TypeError): strategy.unknown_key = 'UNK' + def test_cudnn_exhaustive_search(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.cudnn_exhaustive_search = False + self.assertEqual(strategy.cudnn_exhaustive_search, False) + strategy.cudnn_exhaustive_search = "True" + self.assertEqual(strategy.cudnn_exhaustive_search, False) + + def test_cudnn_batchnorm_spatial_persistent(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.cudnn_batchnorm_spatial_persistent = False + 
self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False) + strategy.cudnn_batchnorm_spatial_persistent = "True" + self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False) + + def test_conv_workspace_size_limit(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.conv_workspace_size_limit = 1000 + self.assertEqual(strategy.conv_workspace_size_limit, 1000) + strategy.conv_workspace_size_limit = "400" + self.assertEqual(strategy.conv_workspace_size_limit, 1000) + strategy._enable_env() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py index 581f8becbbff1945603330221014ca4ce6d51a86..af72df5186876a8bcbaa5bfa6d71a27fdf46b119 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py @@ -44,7 +44,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.gradient_merge = True strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 3e97ab3bfc66ca5becfba2ed94ce94504d100936..9eec73116cc283b58d3ee39cefb9256e12d4ef15 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -15,34 +15,44 @@ import unittest import paddle import os -from launch_function_helper import launch_func +from launch_function_helper import launch_func, wait, _find_free_port class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): + def setUp(self): + try: + self._dist_ut_port_0 = int(os.environ["PADDLE_DIST_UT_PORT"]) + self._dist_ut_port_1 = self._dist_ut_port_0 + 1 + except Exception as e: + self._dist_ut_port_0 = _find_free_port(set()) + self._dist_ut_port_1 = _find_free_port(set()) + def test_graph_execution_optimizer_not_apply(self): + port_a = self._dist_ut_port_0 + port_b = self._dist_ut_port_1 node_a = { "PADDLE_TRAINER_ID": "0", - "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a), "PADDLE_TRAINERS_NUM": "2", - "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), "http_proxy": "", "https_proxy": "" } node_b = { "PADDLE_TRAINER_ID": "1", - "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b), "PADDLE_TRAINERS_NUM": "2", - "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), "http_proxy": "", "https_proxy": "" } def node_func(): import paddle.distributed.fleet as fleet - import paddle.fluid.incubate.fleet.base.role_maker as role_maker - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) + fleet.init(is_collective=True) input_x = paddle.fluid.layers.data( name="x", shape=[32], 
dtype='float32') input_y = paddle.fluid.layers.data( @@ -58,7 +68,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): avg_cost = paddle.fluid.layers.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -67,33 +77,35 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): proc_a.start() proc_b = launch_func(node_func, node_b) proc_b.start() - proc_a.join() - proc_b.join() + wait([proc_a, proc_b]) def test_graph_execution_optimizer(self): + port_a = self._dist_ut_port_0 + 2 + port_b = self._dist_ut_port_1 + 2 + node_a = { "PADDLE_TRAINER_ID": "0", - "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36001", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a), "PADDLE_TRAINERS_NUM": "2", - "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), "http_proxy": "", "https_proxy": "" } node_b = { "PADDLE_TRAINER_ID": "1", - "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36002", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b), "PADDLE_TRAINERS_NUM": "2", - "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), "http_proxy": "", "https_proxy": "" } def node_func(): import paddle.distributed.fleet as fleet - import paddle.fluid.incubate.fleet.base.role_maker as role_maker - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) + fleet.init(is_collective=True) input_x = paddle.fluid.layers.data( name="x", shape=[32], dtype='float32') input_y = paddle.fluid.layers.data( @@ -111,10 +123,131 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace()) + exe.run(paddle.fluid.default_startup_program()) + + import numpy as np + + def gen_data(): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + for i in range(10): + cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + wait([proc_a, proc_b]) + + def test_graph_execution_optimizer_not_apply_v2(self): + port_a = self._dist_ut_port_0 + 4 + port_b = self._dist_ut_port_1 + 4 + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a), + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b), + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + fleet.init(is_collective=True) + 
input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + wait([proc_a, proc_b]) + + def test_graph_execution_optimizer(self): + port_a = self._dist_ut_port_0 + 6 + port_b = self._dist_ut_port_1 + 6 + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a), + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b), + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b), + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + fleet.init(is_collective=True) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.nccl_comm_num = 2 + strategy.sync_nccl_allreduce = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace()) exe.run(paddle.fluid.default_startup_program()) @@ -135,8 +268,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): proc_a.start() proc_b = launch_func(node_func, node_b) proc_b.start() - proc_a.join() - proc_b.join() + wait([proc_a, proc_b]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index 4d92c6f70541d18a02c4bd179bb17e36117cffcb..69f5b134888b0f3268cea112eeefd9fb7fd0127f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -60,7 +60,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index 134aea363b55e1c2d0730fee083f5ef00185af61..3f140f53b043b1949572f3728ca8a0c556317783 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -62,7 +62,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -75,7 +75,8 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.1, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -88,7 +89,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) strategy.lamb_configs = { 'lamb_weight_decay': 0.01, 'exclude_from_weight_decay': ['.b_0'], diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index b15db0b12d0011b4fef663bd41aa250e2ada1539..3caa1a4eac0bf191b13e6708b1a9adffdb111ca7 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -62,7 +62,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -75,7 +76,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 86098d42b823b7e528846e3186dcd0d345bdb018..07b988bf8752057e68925bc42f564a72d466361d 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -46,7 +46,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): config['k_steps'] = 1 strategy.localsgd_configs = config - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) 
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py new file mode 100755 index 0000000000000000000000000000000000000000..dfea848aadfc44c57c91c11d196eff49d57cab08 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +from paddle import fluid +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase + + +class TestFleetMetaOptimizerBase(unittest.TestCase): + def net(main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, + size=64, + act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + opt = MetaOptimizerBase(optimizer) + opt_ops, params_grads = opt.minimize(avg_cost) + opt.apply_optimize(avg_cost, + paddle.static.default_startup_program(), + params_grads) + return None + + net(fluid.default_startup_program(), fluid.default_main_program()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py index ca969bc4032b1dedc578f3b5d4ff13491e276e57..adbb1268c6f4d7b21876dacdbbb3cf453a14d0f4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py @@ -53,7 +53,7 @@ class TestFleetMetaOptimizer(unittest.TestCase): strategy.pipeline = True strategy.pipeline_configs = {'micro_batch': 2} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py index 95e1c3a36025707284edd8fc99b216d3e117341d..a42010a4eaa5066821adb817e7a5df2b81bedf7c 100644 --- 
a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py @@ -45,7 +45,7 @@ class TestFleetRecomputeMetaOptimizer(unittest.TestCase): strategy.recompute = True strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py index 3fd646f4340dc013c10411272bb5c0c7cd52b011..80109716a54e52dc6050b724046561f37020a645 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py @@ -25,6 +25,8 @@ class TestFleetRuntime(unittest.TestCase): base._init_server() base._run_server() base._stop_worker() + base._save_inference_model() + base._save_persistables() def test_fleet_collective_runtime(self): import paddle.distributed.fleet.runtime @@ -35,6 +37,27 @@ class TestFleetRuntime(unittest.TestCase): collective_runtime._init_worker() collective_runtime._run_server() collective_runtime._stop_worker() + collective_runtime._save_inference_model() + collective_runtime._save_persistables() + + def test_fleet_ps_runtime(self): + ps_runtime = paddle.distributed.fleet.runtime.ParameterServerRuntime() + self.assertRaises(Exception, ps_runtime._get_optimizer_status, + "test_op", None) + reshaped_names, origin_names = ps_runtime._get_optimizer_status("adam", + "param") + self.assertTrue( + len(reshaped_names) == 2 and + reshaped_names[0] == 'param_moment1_0' and + reshaped_names[1] == 'param_moment2_0') + self.assertTrue( + len(origin_names) == 2 and + origin_names[0] == 'param_beta1_pow_acc_0' and + origin_names[1] == 'param_beta2_pow_acc_0') + + reshaped_names, origin_names = ps_runtime._get_optimizer_status("sgd", + "param") + self.assertTrue(len(reshaped_names) == 0 and len(origin_names) == 0) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py index 8dbf97b11239b25a7bff01f87344d80988830217..dde36e073fb20eed3b17c79a886739f59ecb185d 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_util.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py @@ -56,7 +56,7 @@ class TestFleetUtil(unittest.TestCase): def test_get_util(self): import paddle.distributed.fleet as fleet - import paddle.fluid.incubate.fleet.base.role_maker as role_maker + import paddle.distributed.fleet.base.role_maker as role_maker role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) default_util = fleet.util @@ -72,7 +72,7 @@ class TestFleetUtil(unittest.TestCase): def get_user_id(self): return 10 - import paddle.fluid.incubate.fleet.base.role_maker as role_maker + import paddle.distributed.fleet.base.role_maker as role_maker role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) my_util = UserDefinedUtil() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py index c43454eaaee9e3b2f9aa371453e58b009c99a52c..68be0bf5d561ef0d8fe92005dd9ddb47c21aca51 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py @@ -37,7 
+37,6 @@ class TestFunctionalConv2D(TestCase): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" def prepare(self): @@ -88,7 +87,6 @@ class TestFunctionalConv2D(TestCase): param_attr=I.NumpyArrayInitializer(self.weight), bias_attr=False if self.no_bias else I.NumpyArrayInitializer(self.bias), - use_cudnn=self.use_cudnn, act=self.act, data_format=self.data_format) exe = fluid.Executor(self.place) @@ -121,9 +119,11 @@ class TestFunctionalConv2D(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + + if self.act == 'sigmoid': + y = F.sigmoid(y) + exe = fluid.Executor(self.place) exe.run(start) feed_dict = {"input": self.input, "weight": self.weight} @@ -144,10 +144,12 @@ class TestFunctionalConv2D(TestCase): padding=self.padding, stride=self.stride, dilation=self.dilation, - act=self.act, groups=self.groups, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + + if self.act == 'sigmoid': + y = F.sigmoid(y) + out = y.numpy() return out @@ -185,7 +187,6 @@ class TestFunctionalConv2DError(TestCase): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" def test_exception(self): @@ -228,9 +229,7 @@ class TestFunctionalConv2DError(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) class TestFunctionalConv2DCase2(TestFunctionalConv2D): @@ -383,21 +382,6 @@ class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError): self.data_format = "NCHW" -class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError): - def setUp(self): - self.in_channels = 3 - self.out_channels = 5 - self.filter_shape = 3 - self.padding = "same" - self.stride = 1 - self.dilation = 1 - self.groups = 1 - self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = "not_valid" - self.data_format = "NCHW" - - class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError): def setUp(self): self.in_channels = 3 diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index 21986f1b98d869289ddb34a65316aca57c83f9d9..1fb07bf4345909deb5485a89232270336658ae8b 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -37,8 +37,6 @@ class TestFunctionalConv2D(TestCase): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" def prepare(self): @@ -90,8 +88,6 @@ class TestFunctionalConv2D(TestCase): param_attr=I.NumpyArrayInitializer(self.weight), bias_attr=False if self.no_bias else I.NumpyArrayInitializer(self.bias), - use_cudnn=self.use_cudnn, - act=self.act, data_format=self.data_format) exe = fluid.Executor(self.place) exe.run(start) @@ -115,7 +111,7 @@ class TestFunctionalConv2D(TestCase): "weight", self.weight.shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias.shape, dtype=self.dtype) - y = F.conv2d_transpose( + y = F.conv_transpose2d( x, weight, None if self.no_bias else bias, @@ -124,9 +120,7 @@ class TestFunctionalConv2D(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - 
act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) exe = fluid.Executor(self.place) exe.run(start) feed_dict = {"input": self.input, "weight": self.weight} @@ -140,7 +134,7 @@ class TestFunctionalConv2D(TestCase): x = dg.to_variable(self.input) weight = dg.to_variable(self.weight) bias = None if self.no_bias else dg.to_variable(self.bias) - y = F.conv2d_transpose( + y = F.conv_transpose2d( x, weight, bias, @@ -148,10 +142,8 @@ class TestFunctionalConv2D(TestCase): padding=self.padding, stride=self.stride, dilation=self.dilation, - act=self.act, groups=self.groups, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) out = y.numpy() return out @@ -189,8 +181,6 @@ class TestFunctionalConv2DError(TestCase): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" def test_exception(self): @@ -225,7 +215,7 @@ class TestFunctionalConv2DError(TestCase): "weight", self.weight_shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias_shape, dtype=self.dtype) - y = F.conv2d_transpose( + y = F.conv_transpose2d( x, weight, None if self.no_bias else bias, @@ -234,9 +224,7 @@ class TestFunctionalConv2DError(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) class TestFunctionalConv2DCase2(TestFunctionalConv2D): @@ -249,8 +237,6 @@ class TestFunctionalConv2DCase2(TestFunctionalConv2D): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" @@ -264,8 +250,6 @@ class TestFunctionalConv2DCase3(TestFunctionalConv2D): self.dilation = 1 self.groups = 1 self.no_bias = True - self.act = None - self.use_cudnn = True self.data_format = "NCHW" @@ -279,8 +263,6 @@ class TestFunctionalConv2DCase4(TestFunctionalConv2D): self.dilation = 1 self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" @@ -294,8 +276,6 @@ class TestFunctionalConv2DCase5(TestFunctionalConv2D): self.dilation = 1 self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" @@ -309,8 +289,6 @@ class TestFunctionalConv2DCase6(TestFunctionalConv2D): self.dilation = (2, 1) self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" @@ -324,8 +302,6 @@ class TestFunctionalConv2DCase7(TestFunctionalConv2D): self.dilation = 1 self.groups = 4 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = False self.data_format = "NHWC" @@ -340,8 +316,6 @@ class TestFunctionalConv2DCase8(TestFunctionalConv2D): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCHW" @@ -355,8 +329,6 @@ class TestFunctionalConv2DCase9(TestFunctionalConv2D): self.dilation = 1 self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" @@ -370,8 +342,6 @@ class TestFunctionalConv2DCase10(TestFunctionalConv2D): self.dilation = 1 self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCHW" @@ -385,8 +355,6 @@ class TestFunctionalConv2DCase11(TestFunctionalConv2D): self.dilation = 1 self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True 
self.data_format = "NCHW" @@ -400,8 +368,6 @@ class TestFunctionalConv2DCase12(TestFunctionalConv2D): self.dilation = 1 self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCHW" @@ -415,8 +381,6 @@ class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" @@ -430,8 +394,6 @@ class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NHWC" @@ -445,8 +407,6 @@ class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCHW" @@ -460,23 +420,6 @@ class TestFunctionalConv2DErrorCase5(TestFunctionalConv2DError): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True - self.data_format = "NCHW" - - -class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError): - def setUp(self): - self.in_channels = 4 - self.out_channels = 5 - self.filter_shape = 3 - self.padding = 0 - self.stride = 1 - self.dilation = 1 - self.groups = 1 - self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = "not_valid" self.data_format = "NCHW" @@ -491,8 +434,6 @@ class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCHW" @@ -506,8 +447,6 @@ class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError): self.dilation = 1 self.groups = 1 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "not_valid" @@ -521,8 +460,6 @@ class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError): self.dilation = 1 self.groups = 2 self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCHW" diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index 195e3812f94843f6ccdd05cbc317238765e4c06b..b413a56c07a9ce3afbe15baffbffaf92a3d42129 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -37,7 +37,6 @@ class TestFunctionalConv3D(TestCase): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" def prepare(self): @@ -88,7 +87,6 @@ class TestFunctionalConv3D(TestCase): param_attr=I.NumpyArrayInitializer(self.weight), bias_attr=False if self.no_bias else I.NumpyArrayInitializer(self.bias), - use_cudnn=self.use_cudnn, act=self.act, data_format=self.data_format) exe = fluid.Executor(self.place) @@ -121,9 +119,11 @@ class TestFunctionalConv3D(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + + if self.act == 'sigmoid': + y = F.sigmoid(y) + exe = fluid.Executor(self.place) exe.run(start) feed_dict = {"input": self.input, "weight": self.weight} @@ -144,10 +144,12 @@ class TestFunctionalConv3D(TestCase): padding=self.padding, stride=self.stride, dilation=self.dilation, - act=self.act, groups=self.groups, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + + if 
self.act == 'sigmoid': + y = F.sigmoid(y) + out = y.numpy() return out @@ -185,7 +187,6 @@ class TestFunctionalConv3DError(TestCase): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" def test_exception(self): @@ -228,9 +229,10 @@ class TestFunctionalConv3DError(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + + if self.act == 'sigmoid': + y = F.sigmoid(y) class TestFunctionalConv3DCase2(TestFunctionalConv3D): @@ -244,7 +246,6 @@ class TestFunctionalConv3DCase2(TestFunctionalConv3D): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -259,7 +260,6 @@ class TestFunctionalConv3DCase3(TestFunctionalConv3D): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -274,7 +274,6 @@ class TestFunctionalConv3DCase4(TestFunctionalConv3D): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -289,7 +288,6 @@ class TestFunctionalConv3DCase5(TestFunctionalConv3D): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -304,7 +302,6 @@ class TestFunctionalConv3DCase6(TestFunctionalConv3D): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -319,7 +316,6 @@ class TestFunctionalConv3DCase7(TestFunctionalConv3D): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -349,7 +345,6 @@ class TestFunctionalConv3DErrorCase2(TestFunctionalConv3DError): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = False self.data_format = "NCDHW" @@ -364,7 +359,6 @@ class TestFunctionalConv3DErrorCase3(TestFunctionalConv3DError): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = False self.data_format = "not_valid" @@ -379,22 +373,6 @@ class TestFunctionalConv3DErrorCase4(TestFunctionalConv3DError): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = False - self.data_format = "NCDHW" - - -class TestFunctionalConv3DErrorCase6(TestFunctionalConv3DError): - def setUp(self): - self.in_channels = 3 - self.out_channels = 5 - self.filter_shape = 3 - self.padding = "same" - self.stride = 1 - self.dilation = 1 - self.groups = 1 - self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = "not_valid" self.data_format = "NCDHW" @@ -409,7 +387,6 @@ class TestFunctionalConv3DErrorCase7(TestFunctionalConv3DError): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "not_valid" @@ -424,7 +401,6 @@ class TestFunctionalConv3DErrorCase8(TestFunctionalConv3DError): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -439,7 +415,6 @@ class TestFunctionalConv3DErrorCase9(TestFunctionalConv3DError): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = False self.data_format = "NCDHW" @@ -454,7 +429,6 @@ class TestFunctionalConv3DErrorCase10(TestFunctionalConv3DError): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = False self.data_format = "NDHWC" diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py 
b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py index f8e7818315fa077df4d8ad0d6d3f76b47501b5e9..7441f7cb915e8b1fdd2155fff79e145fb6a00c0f 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py @@ -38,7 +38,6 @@ class TestFunctionalConv3DTranspose(TestCase): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" def prepare(self): @@ -90,7 +89,6 @@ class TestFunctionalConv3DTranspose(TestCase): param_attr=I.NumpyArrayInitializer(self.weight), bias_attr=False if self.no_bias else I.NumpyArrayInitializer(self.bias), - use_cudnn=self.use_cudnn, act=self.act, data_format=self.data_format) exe = fluid.Executor(self.place) @@ -115,7 +113,7 @@ class TestFunctionalConv3DTranspose(TestCase): "weight", self.weight.shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias.shape, dtype=self.dtype) - y = F.conv3d_transpose( + y = F.conv_transpose3d( x, weight, None if self.no_bias else bias, @@ -124,9 +122,9 @@ class TestFunctionalConv3DTranspose(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + if self.act == 'sigmoid': + y = F.sigmoid(y) exe = fluid.Executor(self.place) exe.run(start) feed_dict = {"input": self.input, "weight": self.weight} @@ -140,7 +138,7 @@ class TestFunctionalConv3DTranspose(TestCase): x = dg.to_variable(self.input) weight = dg.to_variable(self.weight) bias = None if self.no_bias else dg.to_variable(self.bias) - y = F.conv3d_transpose( + y = F.conv_transpose3d( x, weight, bias, @@ -148,10 +146,10 @@ class TestFunctionalConv3DTranspose(TestCase): padding=self.padding, stride=self.stride, dilation=self.dilation, - act=self.act, groups=self.groups, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + if self.act == 'sigmoid': + y = F.sigmoid(y) out = y.numpy() return out @@ -190,7 +188,6 @@ class TestFunctionalConv3DTransposeError(TestCase): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" def test_exception(self): @@ -225,7 +222,7 @@ class TestFunctionalConv3DTransposeError(TestCase): "weight", self.weight_shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias_shape, dtype=self.dtype) - y = F.conv3d_transpose( + y = F.conv_transpose3d( x, weight, None if self.no_bias else bias, @@ -234,9 +231,9 @@ class TestFunctionalConv3DTransposeError(TestCase): stride=self.stride, dilation=self.dilation, groups=self.groups, - act=self.act, - data_format=self.data_format, - use_cudnn=self.use_cudnn) + data_format=self.data_format) + if self.act == 'sigmoid': + y = F.sigmoid(y) class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose): @@ -250,7 +247,6 @@ class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -265,7 +261,6 @@ class TestFunctionalConv3DTransposeCase3(TestFunctionalConv3DTranspose): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -280,7 +275,6 @@ class TestFunctionalConv3DTransposeCase4(TestFunctionalConv3DTranspose): self.groups = 2 self.no_bias = True self.act = "sigmoid" - self.use_cudnn = True self.data_format 
= "NDHWC" @@ -295,7 +289,6 @@ class TestFunctionalConv3DTransposeCase5(TestFunctionalConv3DTranspose): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -310,7 +303,6 @@ class TestFunctionalConv3DTransposeCase6(TestFunctionalConv3DTranspose): self.groups = 4 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = False self.data_format = "NDHWC" @@ -326,7 +318,6 @@ class TestFunctionalConv3DTransposeCase7(TestFunctionalConv3DTranspose): self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -341,7 +332,6 @@ class TestFunctionalConv3DTransposeCase8(TestFunctionalConv3DTranspose): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -356,7 +346,6 @@ class TestFunctionalConv3DTransposeCase9(TestFunctionalConv3DTranspose): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -371,7 +360,6 @@ class TestFunctionalConv3DTransposeCase10(TestFunctionalConv3DTranspose): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -386,7 +374,6 @@ class TestFunctionalConv3DTransposeCase11(TestFunctionalConv3DTranspose): self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -402,7 +389,6 @@ class TestFunctionalConv3DTransposeErrorCase2( self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -418,7 +404,6 @@ class TestFunctionalConv3DTransposeErrorCase3( self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NDHWC" @@ -434,7 +419,6 @@ class TestFunctionalConv3DTransposeErrorCase4( self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -450,23 +434,6 @@ class TestFunctionalConv3DTransposeErrorCase5( self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True - self.data_format = "NCDHW" - - -class TestFunctionalConv3DTransposeErrorCase6( - TestFunctionalConv3DTransposeError): - def setUp(self): - self.in_channels = 4 - self.out_channels = 5 - self.filter_shape = 3 - self.padding = 0 - self.stride = 1 - self.dilation = 1 - self.groups = 1 - self.no_bias = False - self.act = "sigmoid" - self.use_cudnn = "not_valid" self.data_format = "NCDHW" @@ -483,7 +450,6 @@ class TestFunctionalConv3DTransposeErrorCase7( self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" @@ -499,7 +465,6 @@ class TestFunctionalConv3DTransposeErrorCase8( self.groups = 1 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "not_valid" @@ -515,7 +480,6 @@ class TestFunctionalConv3DTransposeErrorCase9( self.groups = 2 self.no_bias = False self.act = "sigmoid" - self.use_cudnn = True self.data_format = "NCDHW" diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py index 06f8da84a28d22127fad122d39d4e3903fdf25bf..47671ab3a85e8596d5b677f5e1cf9f6ebecaf155 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. + from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net from fake_reader import fake_imdb_reader from parallel_executor_test_base import TestParallelExecutorBase diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 62eef67a5695f62c3594824c24886e23a5a59801..921dbdbc6d4e1b169c2c8aa199ea15f886bd0128 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -19,8 +19,6 @@ import unittest class TestFuseBatchNormActPass(unittest.TestCase): def build_program(self, main_program, startup_program, use_cuda, seed=1): - main_program.random_seed = seed - startup_program.random_seed = seed with fluid.program_guard(main_program, startup_program): x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') y = fluid.layers.data(name="y", shape=[1], dtype='int64') @@ -59,6 +57,8 @@ class TestFuseBatchNormActPass(unittest.TestCase): return x, y, loss def check(self, place, use_cuda): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_program = fluid.Program() startup_program = fluid.Program() x, y, loss = self.build_program(main_program, startup_program, use_cuda) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py index b47bcd2a032a32f30b2bcdd2b48541c660abdab2..a22daeedd09e9a1da3a17773fed43d35ece51bec 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net from fake_reader import fake_imdb_reader from parallel_executor_test_base import TestParallelExecutorBase diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py index 892f63bf15b742c51ddbc15262f888e43cdd03f3..bd934c76ebfa2ed7c9b11223b34c812e605ebe18 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py @@ -18,12 +18,11 @@ import unittest import numpy as np from op_test import OpTest import paddle.fluid as fluid +import paddle class TestGatherNdOpWithEmptyIndex(OpTest): - """ - Index has empty element, which means copy entire tensor - """ + #Index has empty element, which means copy entire tensor def setUp(self): self.op_type = "gather_nd" @@ -40,10 +39,22 @@ class TestGatherNdOpWithEmptyIndex(OpTest): self.check_grad(['X'], 'Out') +class TestGatherNdOpWithIndex1(OpTest): + def setUp(self): + self.op_type = "gather_nd" + xnp = np.random.random((5, 20)).astype("float64") + self.inputs = {'X': xnp, 'Index': np.array([1]).astype("int32")} + self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + class TestGatherNdOpWithLowIndex(OpTest): - """ - Index has low rank, X has high rank - """ + #Index has low rank, X has high rank def setUp(self): self.op_type = "gather_nd" @@ -61,10 +72,27 @@ class TestGatherNdOpWithLowIndex(OpTest): self.check_grad(['X'], 'Out') +class TestGatherNdOpIndex1(OpTest): + #Index has low rank, X has high rank + + def setUp(self): + self.op_type = "gather_nd" + xnp = np.random.uniform(0, 100, (10, 10)).astype("float64") + index = np.array([1, 2]).astype("int64") + + self.inputs = {'X': xnp, 'Index': index} + + self.outputs = {'Out': xnp[tuple(index.T)]} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + class TestGatherNdOpWithSameIndexAsX(OpTest): - """ - Index has same rank as X's rank - """ + #Index has same rank as X's rank def setUp(self): self.op_type = "gather_nd" @@ -82,9 +110,7 @@ class TestGatherNdOpWithSameIndexAsX(OpTest): class TestGatherNdOpWithHighRankSame(OpTest): - """ - Both Index and X have high rank, and Rank(Index) = Rank(X) - """ + #Both Index and X have high rank, and Rank(Index) = Rank(X) def setUp(self): self.op_type = "gather_nd" @@ -103,9 +129,7 @@ class TestGatherNdOpWithHighRankSame(OpTest): class TestGatherNdOpWithHighRankDiff(OpTest): - """ - Both Index and X have high rank, and Rank(Index) < Rank(X) - """ + #Both Index and X have high rank, and Rank(Index) < Rank(X) def setUp(self): self.op_type = "gather_nd" @@ -162,5 +186,63 @@ class TestGatherNdOpRaise(unittest.TestCase): self.assertRaises(IndexError, check_raise_is_test) +class TestGatherNdError(unittest.TestCase): + def test_error(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + shape = [8, 9, 6] + x = paddle.data(shape=shape, dtype='float32', name='x') + index = paddle.data(shape=shape, dtype='bool', name='index') + index_float = paddle.data( + shape=shape, dtype='float32', name='index_float') + np_x = np.random.random(shape).astype('float32') + np_index = np.array(np.random.randint(2, size=shape, dtype=bool)) + + def test_x_type(): + paddle.gather_nd(np_x, index) + + self.assertRaises(TypeError, test_x_type) + 
+ def test_index_type(): + paddle.gather_nd(x, np_index) + + self.assertRaises(TypeError, test_index_type) + + def test_index_dtype(): + paddle.gather_nd(x, index_float) + + self.assertRaises(TypeError, test_index_dtype) + + +class TestGatherNdAPI2(unittest.TestCase): + def test_static(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64') + index = fluid.layers.data('index', shape=[-1, 1], dtype='int32') + out = paddle.gather_nd(data1, index) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + input = np.array([[1, 2], [3, 4], [5, 6]]) + index_1 = np.array([[1]]) + result, = exe.run(feed={"data1": input, + "index": index_1}, + fetch_list=[out]) + expected_output = np.array([[3, 4]]) + self.assertTrue(np.allclose(result, expected_output)) + + def test_imperative(self): + paddle.disable_static() + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) + index_1 = np.array([[1]]) + input = fluid.dygraph.to_variable(input_1) + index = fluid.dygraph.to_variable(index_1) + output = paddle.fluid.layers.gather(input, index) + output_np = output.numpy() + expected_output = np.array([3, 4]) + self.assertTrue(np.allclose(output_np, expected_output)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index f8763e731eeed3b36a6271167a57b9277479b5ba..1f6e522d2668b5dcd2075ff7af6b4b1ee674632d 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -21,6 +21,13 @@ import paddle import paddle.fluid as fluid +def gather_numpy(x, index, axis): + x_transpose = np.swapaxes(x, 0, axis) + tmp_gather = x_transpose[index, ...] 
+ gather = np.swapaxes(tmp_gather, 0, axis) + return gather + + class TestGatherOp(OpTest): def setUp(self): self.op_type = "gather" @@ -108,12 +115,80 @@ class TestCase6(TestGatherOp): self.index_type = "int32" +class TestGatherOp1(OpTest): + def setUp(self): + self.op_type = "gather" + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + axis_np = np.array(self.axis).astype(self.index_type) + index_np = np.array(self.index).astype(self.index_type) + out = gather_numpy(xnp, index_np, axis_np[0]) + self.inputs = {'X': xnp, 'Index': index_np, 'Axis': axis_np} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.x_type = "float64" + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + +class TestGatherOp2(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.x_type = "float64" + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + + +class TestGatherOp3(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.x_type = "float64" + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [2] + self.axis_type = "int32" + + +class TestGatherOp4(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 100, 10) + self.x_type = "float64" + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + + class API_TestGather(unittest.TestCase): - def test_out(self): + def test_out1(self): with fluid.program_guard(fluid.Program(), fluid.Program()): data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64') - index = fluid.layers.data('index', shape=[-1, 1], dtype='float64') - out = paddle.gather(data1, index) + index = fluid.layers.data('index', shape=[-1, 1], dtype='int32') + out = paddle.fluid.layers.gather(data1, index) place = fluid.CPUPlace() exe = fluid.Executor(place) input = np.array([[1, 2], [3, 4], [5, 6]]) @@ -124,18 +199,103 @@ class API_TestGather(unittest.TestCase): expected_output = np.array([[3, 4], [5, 6]]) self.assertTrue(np.allclose(result, expected_output)) + def test_out2(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.data('x', shape=[-1, 2], dtype='float64') + index = paddle.data('index', shape=[-1, 1], dtype='int32') + axis = paddle.data('axis', shape=[1], dtype='int32') + out = paddle.gather(x, index, axis) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype('float64') + index_np = np.array([1, 1]).astype('int32') + axis_np = np.array([1]).astype('int32') + result, = exe.run( + feed={"x": x_np, + "index": index_np, + 'axis': axis_np}, + fetch_list=[out]) + expected_output = gather_numpy(x_np, index_np, axis_np) + self.assertTrue(np.allclose(result, expected_output)) + class API_TestDygraphGather(unittest.TestCase): - def test_out(self): - with fluid.dygraph.guard(): - input_1 = np.array([[1, 2], [3, 4], [5, 6]]) - index_1 = np.array([1, 2]) - input = fluid.dygraph.to_variable(input_1) - index = fluid.dygraph.to_variable(index_1) - output = paddle.fluid.layers.gather(input, index) - output_np = output.numpy() - expected_output = 
np.array([[3, 4], [5, 6]]) + def test_out1(self): + paddle.disable_static() + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) + index_1 = np.array([1, 2]) + input = paddle.to_tensor(input_1) + index = paddle.to_tensor(index_1) + output = paddle.fluid.layers.gather(input, index) + output_np = output.numpy() + expected_output = np.array([[3, 4], [5, 6]]) + self.assertTrue(np.allclose(output_np, expected_output)) + paddle.enable_static() + + def test_out12(self): + paddle.disable_static() + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) + index_1 = np.array([1, 2]) + x = paddle.to_tensor(input_1) + index = paddle.to_tensor(index_1) + output = paddle.gather(x, index, axis=0) + output_np = output.numpy() + expected_output = gather_numpy(input_1, index_1, axis=0) self.assertTrue(np.allclose(output_np, expected_output)) + paddle.enable_static() + + +class TestGathertError(unittest.TestCase): + def test_error1(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + shape = [8, 9, 6] + x = paddle.data(shape=shape, dtype='int8', name='x') + axis = paddle.data(shape=[1], dtype='float32', name='axis') + index = paddle.data(shape=shape, dtype='int32', name='index') + index_float = paddle.data( + shape=shape, dtype='float32', name='index_float') + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises(TypeError, test_x_type) + + def test_index_type(): + paddle.gather(x, index_float) + + self.assertRaises(TypeError, test_index_type) + + def test_axis_dtype(): + paddle.gather(x, index, axis=1.11) + + self.assertRaises(TypeError, test_axis_dtype) + + def test_axis_dtype(): + paddle.gather(x, index, axis=axis) + + self.assertRaises(TypeError, test_axis_dtype) + + def test_error2(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + + shape = [8, 9, 6] + x = fluid.data(shape=shape, dtype='int8', name='x') + index = fluid.data(shape=shape, dtype='int32', name='mask') + index_float = fluid.data( + shape=shape, dtype='float32', name='index_float') + + def test_x_type(): + paddle.fluid.layers.gather(x, index) + + self.assertRaises(TypeError, test_x_type) + + def test_index_type(): + paddle.fluid.layers.gather(x, index_float) + + self.assertRaises(TypeError, test_index_type) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 6b08c4250f61c9680a13b21f1c6c2e940c60ca75..9ab84404073906a8a95f9eb562cbe220e7c6b455 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np - +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.op import Operator @@ -37,6 +37,7 @@ class TestGaussianRandomOp(OpTest): "seed": 10, "use_mkldnn": self.use_mkldnn } + paddle.manual_seed(10) self.outputs = {'Out': np.zeros((123, 92), dtype='float32')} diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index a5d36203b0ad567a4ba25d686652c9384ea424bf..5054256ca72477785076cdff69266160f6c7d640 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -224,7 +224,8 @@ def _expand_bbox_targets(bbox_targets_input, class_nums, 
is_cls_agnostic): class TestGenerateProposalLabelsOp(OpTest): def set_data(self): - self.use_random = False + #self.use_random = False + self.init_use_random() self.init_test_cascade() self.init_test_params() self.init_test_input() @@ -267,6 +268,9 @@ class TestGenerateProposalLabelsOp(OpTest): def init_test_cascade(self, ): self.is_cascade_rcnn = False + def init_use_random(self): + self.use_random = False + def init_test_params(self): self.batch_size_per_im = 512 self.fg_fraction = 0.25 @@ -329,6 +333,28 @@ class TestCascade(TestGenerateProposalLabelsOp): self.is_cascade_rcnn = True +class TestUseRandom(TestGenerateProposalLabelsOp): + def init_use_random(self): + self.use_random = True + self.is_cascade_rcnn = False + + def test_check_output(self): + self.check_output_customized(self.verify_out) + + def verify_out(self, outs): + print("skip") + + def init_test_params(self): + self.batch_size_per_im = 512 + self.fg_fraction = 0.025 + self.fg_thresh = 0.5 + self.bg_thresh_hi = 0.5 + self.bg_thresh_lo = 0.0 + self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2] + self.is_cls_agnostic = False + self.class_nums = 2 if self.is_cls_agnostic else 81 + + class TestClsAgnostic(TestCascade): def init_test_params(self): self.batch_size_per_im = 512 diff --git a/python/paddle/fluid/tests/unittests/test_generator.py b/python/paddle/fluid/tests/unittests/test_generator.py index 6cc43d3d5498284e8a24dd272eaed08cdf830733..8b1f420358d3187bd4746431fefe449a4d6ed2ec 100644 --- a/python/paddle/fluid/tests/unittests/test_generator.py +++ b/python/paddle/fluid/tests/unittests/test_generator.py @@ -16,6 +16,7 @@ from __future__ import print_function import os import unittest +import paddle import paddle.fluid.generator as generator import time # temp for debug @@ -34,10 +35,11 @@ class TestGenerator(unittest.TestCase): st = gen.get_state() gen.set_state(st) gen.random() - gen.set_cpu_engine(gen.get_cpu_engine()) def test_basic_generator_error(self): - self.assertRaises(ValueError, generator.Generator, device="CUDA") + if paddle.fluid.core.is_compiled_with_cuda(): + self.assertRaises( + ValueError, generator.Generator, place=paddle.CUDAPlace(0)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py index 6660bfb0c747300741b305a101734e1ef808eeb5..7c1ff41f7e7674936f2725dae3ea19f399cf51e4 100644 --- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py @@ -35,10 +35,10 @@ def random_reader(): def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup_prog = fluid.Program() main_prog = fluid.Program() - startup_prog.random_seed = 1 - main_prog.random_seed = 1 with fluid.unique_name.guard(): with fluid.program_guard(main_prog, startup_prog): @@ -124,14 +124,8 @@ class TestBase(unittest.TestCase): label = item['label'] assert image.shape() == [BATCH_SIZE, 784] assert label.shape() == [BATCH_SIZE, 1] - if ps[i]._equals(fluid.CPUPlace()): - assert image._place()._equals(fluid.CPUPlace()) - assert label._place()._equals(fluid.CPUPlace()) - else: - assert image._place()._equals( - fluid.CUDAPinnedPlace()) - assert label._place()._equals( - fluid.CUDAPinnedPlace()) + assert image._place()._equals(ps[i]) + assert label._place()._equals(ps[i]) L, = exe.run(program=prog, feed=d, fetch_list=[loss], diff --git 
a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py new file mode 100644 index 0000000000000000000000000000000000000000..4a33f32a0b6977716d8065419f8e0f88d6c4f44a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle import fluid, nn +import paddle.fluid.dygraph as dg +import paddle.nn.functional as F +import unittest + + +class GridSampleTestCase(unittest.TestCase): + def __init__(self, + methodName='runTest', + x_shape=[2, 2, 3, 3], + grid_shape=[2, 3, 3, 2], + mode="bilinear", + padding_mode="zeros", + align_corners=False): + super(GridSampleTestCase, self).__init__(methodName) + self.padding_mode = padding_mode + self.x_shape = x_shape + self.grid_shape = grid_shape + self.mode = mode + self.padding_mode = padding_mode + self.align_corners = align_corners + self.dtype = "float64" + + def setUp(self): + self.x = np.random.randn(*(self.x_shape)).astype(self.dtype) + self.grid = np.random.uniform(-1, 1, self.grid_shape).astype(self.dtype) + + def static_functional(self, place): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + x = fluid.data("x", self.x_shape, dtype=self.dtype) + grid = fluid.data("grid", self.grid_shape, dtype=self.dtype) + y_var = F.grid_sample( + x, + grid, + mode=self.mode, + padding_mode=self.padding_mode, + align_corners=self.align_corners) + feed_dict = {"x": self.x, "grid": self.grid} + exe = fluid.Executor(place) + exe.run(start) + y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def dynamic_functional(self): + x_t = paddle.to_tensor(self.x) + grid_t = paddle.to_tensor(self.grid) + y_t = F.grid_sample( + x_t, + grid_t, + mode=self.mode, + padding_mode=self.padding_mode, + align_corners=self.align_corners) + y_np = y_t.numpy() + return y_np + + def _test_equivalence(self, place): + result1 = self.static_functional(place) + with dg.guard(place): + result2 = self.dynamic_functional() + np.testing.assert_array_almost_equal(result1, result2) + + def runTest(self): + place = fluid.CPUPlace() + self._test_equivalence(place) + + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + self._test_equivalence(place) + + +class GridSampleErrorTestCase(GridSampleTestCase): + def runTest(self): + place = fluid.CPUPlace() + with self.assertRaises(ValueError): + self.static_functional(place) + + +def add_cases(suite): + suite.addTest(GridSampleTestCase(methodName='runTest')) + suite.addTest( + GridSampleTestCase( + methodName='runTest', + mode='bilinear', + padding_mode='reflect', + align_corners=True)) + suite.addTest( + GridSampleTestCase( + methodName='runTest', + mode='bilinear', + padding_mode='zeros', + align_corners=True)) + + +def add_error_cases(suite): + suite.addTest( 
+ GridSampleErrorTestCase( + methodName='runTest', padding_mode="VALID")) + suite.addTest( + GridSampleErrorTestCase( + methodName='runTest', align_corners="VALID")) + suite.addTest(GridSampleErrorTestCase(methodName='runTest', mode="VALID")) + + +def load_tests(loader, standard_tests, pattern): + suite = unittest.TestSuite() + add_cases(suite) + add_error_cases(suite) + return suite + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py index bd5a07769e30de5110566f630de2d480e3426c77..4d1ed5aeb96ebbe064e35c1bee9d5775812440f7 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py @@ -17,17 +17,17 @@ import numpy as np from op_test import OpTest -def AffineGrid(theta, size): - n = size[0] - h = size[2] - w = size[3] +def AffineGrid(theta, grid_shape): + n = grid_shape[0] + h = grid_shape[1] + w = grid_shape[2] h_idx = np.repeat( np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis] w_idx = np.repeat( np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis] grid = np.concatenate( [w_idx, h_idx, np.ones([h, w, 1])], axis=2) # h * w * 3 - grid = np.repeat(grid[np.newaxis, :], size[0], axis=0) # n * h * w *3 + grid = np.repeat(grid[np.newaxis, :], n, axis=0) # n * h * w *3 ret = np.zeros([n, h * w, 2]) theta = theta.transpose([0, 2, 1]) @@ -40,15 +40,19 @@ def AffineGrid(theta, size): def getGridPointValue(data, x, y): data_shape = data.shape N = data_shape[0] - H = data_shape[2] - W = data_shape[3] - - out = np.zeros(data_shape, dtype='float64') + C = data_shape[1] + in_H = data_shape[2] + in_W = data_shape[3] + out_H = x.shape[1] + out_W = x.shape[2] + + #out = np.zeros(data_shape, dtype='float64') + out = np.zeros([N, C, out_H, out_W], dtype='float64') for i in range(N): - for j in range(H): - for k in range(W): - if y[i, j, k] < 0 or y[i, j, k] > H - 1 or x[i, j, k] < 0 or x[ - i, j, k] > W - 1: + for j in range(out_H): + for k in range(out_W): + if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[ + i, j, k] < 0 or x[i, j, k] > in_W - 1: out[i, :, j, k] = 0 else: out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]] @@ -56,44 +60,89 @@ def getGridPointValue(data, x, y): return out -def GridSampler(data, grid): - dims = data.shape - N = dims[0] - C = dims[1] - H = dims[2] - W = dims[3] +def clip(x, min_n, max_n): + return np.maximum(np.minimum(x, max_n), min_n) - x = grid[:, :, :, 0] - y = grid[:, :, :, 1] - y_max = H - 1 - x_max = W - 1 - x = 0.5 * ((x.astype('float64') + 1.0) * x_max) - y = 0.5 * ((y.astype('float64') + 1.0) * y_max) +def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode): + if align_corners: + grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * max_val) + else: + grid_slice = 0.5 * ( + (grid_slice.astype('float64') + 1.0) * (max_val + 1)) - 0.5 + + if padding_mode == "border": + grid_slice = clip(grid_slice, 0, max_val) + elif padding_mode == "reflect": + double_range = 2 * max_val if align_corners else (max_val + 1) * 2 + grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice + + 0.5) + extra = grid_abs - np.floor(grid_abs / double_range) * double_range + grid_slice = np.minimum(extra, double_range - extra) + grid_slice = grid_slice if align_corners else clip(grid_slice - 0.5, 0, + max_val) + return grid_slice - x0 = np.floor(x).astype('int32') - x1 = x0 + 1 - y0 = np.floor(y).astype('int32') - y1 = y0 
+ 1 - wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1)) - wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1)) - wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1)) - wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1)) +def GridSampler(data, + grid, + align_corners=True, + mode="bilinear", + padding_mode="zeros"): + dims = data.shape + N = dims[0] + in_C = dims[1] + in_H = dims[2] + in_W = dims[3] - va = getGridPointValue(data, x0, y0) - vb = getGridPointValue(data, x0, y1) - vc = getGridPointValue(data, x1, y0) - vd = getGridPointValue(data, x1, y1) + out_H = grid.shape[1] + out_W = grid.shape[2] - out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64') + x = grid[:, :, :, 0] + y = grid[:, :, :, 1] + y_max = in_H - 1 + x_max = in_W - 1 + + x = unnormalizeAndClip(x, x_max, align_corners, padding_mode) + y = unnormalizeAndClip(y, y_max, align_corners, padding_mode) + + if mode == "bilinear": + x0 = np.floor(x).astype('int32') + x1 = x0 + 1 + y0 = np.floor(y).astype('int32') + y1 = y0 + 1 + + wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + + va = getGridPointValue(data, x0, y0) + vb = getGridPointValue(data, x0, y1) + vc = getGridPointValue(data, x1, y0) + vd = getGridPointValue(data, x1, y1) + + out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64') + elif mode == "nearest": + x = np.round(x).astype('int32') + y = np.round(y).astype('int32') + out = getGridPointValue(data, x, y) return out class TestGridSamplerOp(OpTest): def setUp(self): - self.initTestCase() + self.use_cudnn = False + self.numeric_grad_delta = 0.0001 self.op_type = 'grid_sampler' + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + self.initTestCase() x = np.random.randint(0, 255, self.x_shape).astype('float64') theta = np.zeros(self.theta_shape).astype('float64') @@ -101,22 +150,90 @@ class TestGridSamplerOp(OpTest): for j in range(2): for k in range(3): theta[i, j, k] = np.random.rand(1)[0] - grid = AffineGrid(theta, self.x_shape) + grid = AffineGrid(theta, self.grid_shape) self.inputs = {'X': x, 'Grid': grid} - self.attrs = {'use_cudnn': True} - self.outputs = {'Output': GridSampler(x, grid)} + self.attrs = { + 'use_cudnn': self.use_cudnn, + "align_corners": self.align_corners, + "padding_mode": self.padding_mode, + "mode": self.mode + } + # print("X: {}".format(x)) + self.outputs = { + 'Output': GridSampler(x, grid, self.align_corners, self.mode, + self.padding_mode) + } def test_check_output(self): self.check_output() def test_check_grad_normal(self): - self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61) + self.check_grad( + ['X', 'Grid'], + 'Output', + max_relative_error=0.01, + numeric_grad_delta=self.numeric_grad_delta) + + def initTestCase(self): + self.x_shape = (2, 3, 8, 8) + self.grid_shape = (2, 7, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + self.use_cudnn = True + + +class Case1(TestGridSamplerOp): + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "zeros" + self.mode = 
"bilinear" + + +class Case1(TestGridSamplerOp): + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "border" + self.mode = "bilinear" + + +class Case2(TestGridSamplerOp): + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflect" + self.mode = "bilinear" + + +class Case3(TestGridSamplerOp): + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "reflect" + self.mode = "bilinear" + +class Case4(TestGridSamplerOp): def initTestCase(self): - self.x_shape = (2, 5, 7, 3) - self.grid_shape = (2, 7, 3, 2) + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflect" + self.mode = "nearest" + self.numeric_grad_delta = 0.0001 if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..654e8d6f129e1ffe0dce59113ca88a16d348f210 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestDygraphGroupNormv2(unittest.TestCase): + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [2, 6, 2, 2] + + def compute_v1(x): + with fluid.dygraph.guard(p): + gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + y = gn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + y = gn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [2, 6, 2, 2] + + def compute_v1(x_np): + with program_guard(Program(), Program()): + gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = gn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = gn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py new file mode 100644 index 0000000000000000000000000000000000000000..430ed1abe860869d791f0eac17accc8416db1eca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py @@ -0,0 +1,104 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import unittest +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet +import os +import sys + +from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError + +java_home = os.environ["JAVA_HOME"] + +from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase + + +class FSTest1(FSTestBase): + def test_timeout(self): + fs = HDFSClient( + "/usr/local/hadoop-2.7.7/", + None, + time_out=6 * 1000, + sleep_inter=100) + src = "hdfs_test_timeout" + dst = "new_hdfs_test_timeout" + fs.delete(dst) + fs.mkdirs(src) + fs.mkdirs(dst) + fs.mkdirs(dst + "/" + src) + output = "" + try: + fs.mv(src, dst, test_exists=False) + self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd, + output)) + except FSTimeOut as e: + print("execute mv {} to {} timeout".format(src, dst)) + + cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst) + ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000) + self.assertNotEqual(ret, 0) + print("second mv ret:{} output:{}".format(ret, output)) + + def test_is_dir(self): + fs = HDFSClient( + "/usr/local/hadoop-2.7.7/", + None, + time_out=6 * 1000, + sleep_inter=100) + self.assertFalse(fs.is_dir("./test_hdfs.py")) + s = """ +java.io.IOException: Input/output error + responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error + at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164) + at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118) + at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696) + at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297) + at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514) + at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092) + at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285) + at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65) + at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79) + at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353) + """ + + print("split lines:", s.splitlines()) + self.assertTrue(fs._test_match(s.splitlines()) != None) + + def test_config(self): + config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"} + fs = HDFSClient( + "/usr/local/hadoop-2.7.7/", + config, + time_out=6 * 1000, + sleep_inter=100) + + def test_exists(self): + fs = HDFSClient( + "/usr/local/hadoop-2.7.7/", + None, + time_out=6 * 1000, + sleep_inter=100) + self.assertFalse(fs.is_exist(os.path.abspath("./xxxx"))) + self.assertFalse(fs.is_dir(os.path.abspath("./xxxx"))) + self.assertTrue(fs.is_dir(os.path.abspath("./xxx/.."))) + dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs1.py")) + self.assertTrue(dirs == []) + self.assertTrue(len(files) == 1) + dirs, files = fs.ls_dir(os.path.abspath("./xxx/..")) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py new file mode 100644 index 0000000000000000000000000000000000000000..7754f89e3c901ac14cb102881e8d338442038559 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet +import os +import sys + +from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError + +java_home = os.environ["JAVA_HOME"] + +from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase + + +class FSTest2(FSTestBase): + def test_hdfs(self): + fs = HDFSClient( + "/usr/local/hadoop-2.7.7/", + None, + time_out=5 * 1000, + sleep_inter=100) + self._test_rm(fs) + self._test_touch(fs) + self._test_dirs(fs) + + def test_local(self): + fs = LocalFS() + self._test_rm(fs) + self._test_touch(fs) + self._test_dirs(fs) + + self._test_touch_file(fs) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py new file mode 100644 index 0000000000000000000000000000000000000000..1a045f4b17fc9b8b68ccf81a23cb953db58a9db7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle.fluid as fluid +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet +import os +import sys + +from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError + +java_home = os.environ["JAVA_HOME"] + +from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase + + +class FSTest3(FSTestBase): + def test_hdfs(self): + fs = HDFSClient( + "/usr/local/hadoop-2.7.7/", + None, + time_out=5 * 1000, + sleep_inter=100) + self._test_mkdirs(fs) + self._test_list_dir(fs) + self._test_try_upload(fs) + self._test_try_download(fs) + + self._test_upload(fs) + self._test_download(fs) + + def test_local(self): + fs = LocalFS() + self._test_mkdirs(fs) + self._test_list_dir(fs) + self._test_try_upload(fs) + self._test_try_download(fs) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 5777bb3c6f5e34f035c32ed963906b5ccc03ba85..5c9867e681524f519e267fb744fc4090c836036a 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -266,8 +267,8 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase): def training_test(self, is_sparse): with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle.manual_seed(1) start_up = fluid.default_startup_program() - start_up.random_seed = 1 # Fix random seed x = np.arange(6).reshape(6) path_table = np.array([(1, 2, -1), (1, 2, -1)]).astype('int64') path_code = np.array([(1, 0, -1), (0, 0, -1)]).astype('int64') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index a4f3858d6fb242b8689bd1d300861faf8ed73e54..fdf7adbfb45f0a6133909f4a7d6b488cae09144c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -121,6 +121,7 @@ class TestAmpScaler(unittest.TestCase): def run_simple_conv(inp_np, use_scaler=True): paddle.manual_seed(10) + paddle.framework.random._manual_program_seed(10) with fluid.dygraph.guard(): model = SimpleConv( num_channels=3, @@ -204,6 +205,7 @@ class TestResnet(unittest.TestCase): with fluid.dygraph.guard(): paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) resnet = ResNet(use_cudnn=True) optimizer = optimizer_setting( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 2a25bf6f8abade11d9ad25894753f6d17066e7fd..837e82882e9df8f50ca83a5df20ddf0f03ee504b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -238,8 +238,7 @@ class TestImperativeAutoPrune(unittest.TestCase): out2 = linear2(b) out1.stop_gradient = True out = fluid.layers.concat(input=[out1, out2, c], axis=1) - backward_strategy = fluid.dygraph.BackwardStrategy() - out.backward(backward_strategy) + out.backward() 
self.assertTrue(linear.weight.gradient() is None) self.assertTrue(out1.gradient() is None) @@ -311,9 +310,8 @@ class TestImperativeAutoPrune(unittest.TestCase): out2 = linear2(b) out1.stop_gradient = True out = fluid.layers.concat(input=[out1, out2, c], axis=1) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - out.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + out.backward() self.assertTrue(linear.weight.gradient() is None) self.assertTrue(out1.gradient() is None) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index f83f8ef35215e5a0199c4d63744882126212b928..b74182d27ab8c89cc43d3fc1656ca13916d159c1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -314,9 +314,8 @@ class TestImperative(unittest.TestCase): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + loss2.backward() self.assertTrue(np.allclose(ret.numpy(), x * 10)) self.assertTrue(np.allclose(inputs[0].gradient(), x)) @@ -403,9 +402,8 @@ class TestImperative(unittest.TestCase): x2 = l2(var_inp2)[0] self.assertIsNotNone(x2) dy_out2 = x2.numpy() - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - x2.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + x2.backward() dy_grad2 = l2._x_for_debug.gradient() with new_program_scope(): @@ -442,9 +440,8 @@ class TestImperative(unittest.TestCase): mlp2 = MLP(input_size=2) out2 = mlp2(var_inp2) dy_out2 = out2.numpy() - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - out2.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + out2.backward() dy_grad2 = mlp2._linear1.weight.gradient() with new_program_scope(): @@ -552,9 +549,8 @@ class TestImperative(unittest.TestCase): simple_rnn2 = SimpleRNN() outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2) dy_out2 = outs2[3].numpy() - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - outs2[3].backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + outs2[3].backward() dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient() dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient() dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py index d3f488d92ac455072b37274e2ce782bcf41e8cc7..428f97c0af8182efdaab31dbd720e523578f2292 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py @@ -43,7 +43,7 @@ class MLP(fluid.Layer): class TestDataParallelStateDict(unittest.TestCase): def test_data_parallel_state_dict(self): with fluid.dygraph.guard(): - strategy = paddle.prepare_context() + strategy = paddle.distributed.prepare_context() mlp = MLP() parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py 
b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index f76c3bd958081070939a85c390eeaeaa389ad5a4..cc6c2f97a9334bf0c3932ecc1fcc18b0b56b2797 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -206,11 +206,10 @@ class TestDygraphDeepCF(unittest.TestCase): else: (users_np, items_np, labels_np, num_users, num_items, matrix) = get_data() - + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) startup = fluid.Program() - startup.random_seed = seed main = fluid.Program() - main.random_seed = seed scope = fluid.core.Scope() with new_program_scope(main=main, startup=startup, scope=scope): @@ -244,8 +243,8 @@ class TestDygraphDeepCF(unittest.TestCase): sys.stderr.write('static loss %s\n' % static_loss) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) deepcf = DeepCF(num_users, num_items, matrix) adam = fluid.optimizer.AdamOptimizer( @@ -269,14 +268,13 @@ class TestDygraphDeepCF(unittest.TestCase): sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss)) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) deepcf2 = DeepCF(num_users, num_items, matrix) adam2 = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf2.parameters()) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) for e in range(NUM_EPOCHES): sys.stderr.write('epoch %d\n' % e) for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): @@ -289,7 +287,7 @@ class TestDygraphDeepCF(unittest.TestCase): fluid.layers.log_loss(prediction2, to_variable(labels_np[ slice:slice + BATCH_SIZE]))) - loss2.backward(backward_strategy) + loss2.backward() adam2.minimize(loss2) deepcf2.clear_gradients() dy_loss2 = loss2.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 429736803a192a7cdf01522406f95f8e7c892390..720c9f95c251ec54c7e7fa74c8e59e135a8c6be7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -52,8 +52,7 @@ class TestDygraphDoubleGrad(TestCase): retain_graph=None, create_graph=False, allow_unused=False): - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = self.sort_sum_gradient + fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient}) return fluid.dygraph.grad( outputs=outputs, inputs=inputs, @@ -61,8 +60,7 @@ class TestDygraphDoubleGrad(TestCase): no_grad_vars=no_grad_vars, retain_graph=retain_graph, create_graph=create_graph, - allow_unused=allow_unused, - backward_strategy=backward_strategy) + allow_unused=allow_unused) @dygraph_guard def test_exception(self): @@ -310,10 +308,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): out = out + linear(input) return out - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + with fluid.dygraph.guard(): paddle.manual_seed(123) + paddle.framework.random._manual_program_seed(123) 
a = fluid.dygraph.to_variable(value) a.stop_gradient = False @@ -324,18 +323,18 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): inputs=[a], create_graph=False, only_inputs=True, - allow_unused=False, - backward_strategy=backward_strategy) + allow_unused=False) grad_1 = dx[0].numpy() with fluid.dygraph.guard(): paddle.manual_seed(123) + paddle.framework.random._manual_program_seed(123) a = fluid.dygraph.to_variable(value) a.stop_gradient = False out = model_f(a) - out.backward(backward_strategy) + out.backward() grad_2 = a.gradient() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index b7ebd23a0b74208e768ea4e67b69dc4a596c6764..b752b439f0fa945f75c8cb0c1478668e3dd2c6d5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -56,13 +56,11 @@ class Generator(fluid.Layer): class TestDygraphGAN(unittest.TestCase): def test_gan_float32(self): seed = 90 - + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup = fluid.Program() - startup.random_seed = seed discriminate_p = fluid.Program() generate_p = fluid.Program() - discriminate_p.random_seed = seed - generate_p.random_seed = seed scope = fluid.core.Scope() with new_program_scope( @@ -133,8 +131,8 @@ class TestDygraphGAN(unittest.TestCase): dy_params = dict() with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) discriminator = Discriminator() generator = Generator() @@ -177,11 +175,9 @@ class TestDygraphGAN(unittest.TestCase): dy_params2 = dict() with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) discriminator2 = Discriminator() generator2 = Generator() sgd2 = SGDOptimizer( @@ -201,7 +197,7 @@ class TestDygraphGAN(unittest.TestCase): x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32)))) d_loss2 = d_loss_real2 + d_loss_fake2 - d_loss2.backward(backward_strategy) + d_loss2.backward() sgd2.minimize(d_loss2) discriminator2.clear_gradients() generator2.clear_gradients() @@ -211,7 +207,7 @@ class TestDygraphGAN(unittest.TestCase): g_loss2 = fluid.layers.reduce_mean( fluid.layers.sigmoid_cross_entropy_with_logits( x=d_fake2, label=to_variable(np.ones([2, 1], np.float32)))) - g_loss2.backward(backward_strategy) + g_loss2.backward() sgd2.minimize(g_loss2) for p in discriminator2.parameters(): dy_params2[p.name] = p.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 01f3c02774698376c576a446f32634583623a737..4db6f2d0da1d5287a3c9ccca7c5459e5915a514f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -61,12 +61,10 @@ class GCN(fluid.Layer): class TestDygraphGNN(unittest.TestCase): def test_gnn_float32(self): - seed = 90 - + paddle.manual_seed(90) + paddle.framework.random._manual_program_seed(90) startup = fluid.Program() - startup.random_seed = seed main = fluid.Program() - main.random_seed = seed scope = 
fluid.core.Scope() with new_program_scope(main=main, startup=startup, scope=scope): @@ -114,8 +112,8 @@ class TestDygraphGNN(unittest.TestCase): scope.find_var(model.gc.weight.name).get_tensor()) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(90) + paddle.framework.random._manual_program_seed(90) features = np.ones([1, 100, 50], dtype=np.float32) # Use selected rows when it's supported. @@ -140,8 +138,8 @@ class TestDygraphGNN(unittest.TestCase): model_gc_weight_value = model.gc.weight.numpy() with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(90) + paddle.framework.random._manual_program_seed(90) features2 = np.ones([1, 100, 50], dtype=np.float32) # Use selected rows when it's supported. diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py index 4fe4d963ca5ee4cff1e7073d11361de69e68aa9f..317353684317f6fa0e8cf37cda58f2041e70befd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py @@ -62,8 +62,7 @@ class Test_Forward_Hook(unittest.TestCase): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) input_word = np.array( [0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, @@ -132,8 +131,7 @@ class Test_Forward_Hook(unittest.TestCase): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) global call_forward_hook global call_forward_pre_hook diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py index a391c088a3640c097ff0f4ff714bf50470c575c6..f61d1ab888a51b2ebe4d1205b30fb84dfa4e7aeb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py @@ -28,11 +28,11 @@ class LeNetDygraph(fluid.dygraph.Layer): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = nn.Sequential( - nn.Conv2D( + nn.Conv2d( 1, 6, 3, stride=1, padding=1), nn.ReLU(), nn.Pool2D(2, 'max', 2), - nn.Conv2D( + nn.Conv2d( 6, 16, 5, stride=1, padding=0), nn.ReLU(), nn.Pool2D(2, 'max', 2)) @@ -40,9 +40,8 @@ class LeNetDygraph(fluid.dygraph.Layer): if num_classes > 0: self.fc = nn.Sequential( nn.Linear(400, 120), - nn.Linear(120, 84), - nn.Linear( - 84, 10, act=classifier_activation)) + nn.Linear(120, 84), nn.Linear(84, 10), + nn.Softmax()) #Todo: accept any activation def forward(self, inputs): x = self.features(inputs) @@ -61,7 +60,7 @@ def init_weights(layer): new_bias = paddle.fill_constant( layer.bias.shape, layer.bias.dtype, value=-0.1) layer.bias.set_value(new_bias) - elif type(layer) == nn.Conv2D: + elif type(layer) == nn.Conv2d: new_weight = paddle.fill_constant( layer.weight.shape, layer.weight.dtype, value=0.7) layer.weight.set_value(new_weight) @@ 
-81,7 +80,7 @@ class TestLayerApply(unittest.TestCase): if type(layer) == nn.Linear: np.testing.assert_allclose(layer.weight.numpy(), 0.9) np.testing.assert_allclose(layer.bias.numpy(), -0.1) - elif type(layer) == nn.Conv2D: + elif type(layer) == nn.Conv2d: np.testing.assert_allclose(layer.weight.numpy(), 0.7) np.testing.assert_allclose(layer.bias.numpy(), -0.2) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py index e6d8b052d7f1839466d09e4756b49cc6a38554cc..c7e0902341a59649219cf94ef9741fdf7ae09233 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py @@ -27,11 +27,11 @@ class LeNetDygraph(fluid.dygraph.Layer): def __init__(self): super(LeNetDygraph, self).__init__() self.features = nn.Sequential( - nn.Conv2D( + nn.Conv2d( 1, 6, 3, stride=1, padding=1), nn.ReLU(), nn.Pool2D(2, 'max', 2), - nn.Conv2D( + nn.Conv2d( 6, 16, 5, stride=1, padding=0), nn.ReLU(), nn.Pool2D(2, 'max', 2)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 69fd7d80327f1a666870dc76e041449366565b01..f0fea2d7eb75cff376ebce3505e175030619697c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -94,8 +95,8 @@ class TestDygraphSimpleNet(unittest.TestCase): for is_sort_sum_gradient in [True, False]: with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, @@ -113,8 +114,9 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_loss = None helper = DyGraphProgramDescTracerTestHelper(self) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = is_sort_sum_gradient + fluid.set_flags({ + 'FLAGS_sort_sum_gradient': is_sort_sum_gradient + }) for i in range(batch_num): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -129,7 +131,7 @@ class TestDygraphSimpleNet(unittest.TestCase): if i == 0: for param in simple_net.parameters(): dy_param_init[param.name] = param.numpy() - dy_loss.backward(backward_strategy) + dy_loss.backward() sgd.minimize(dy_loss) sgd.clear_gradients() if i == batch_num - 1: @@ -138,8 +140,8 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_loss_value = dy_loss.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py index 4ce0ca350ddb9e8b9873a1650eefa1d5b2db4938..bda1958c0f3544bef51e51cf418ae6c07bdd7056 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py @@ -36,8 +36,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase): with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) mnist2 = MNIST() sgd2 = SGDOptimizer( @@ -69,7 +68,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase): for param in mnist2.parameters(): dy_param_init_value2[param.name] = param.numpy() - avg_loss2.backward(backward_strategy) + avg_loss2.backward() sgd2.minimize(avg_loss2) mnist2.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 246b013f1ada6bc853711e146379b8bb2df5e363..5400b785d2929b4ff8614d4a6dbe26f197bf5ad1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np import six +import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit @@ -401,10 +402,9 @@ class TestDygraphOCRAttention(unittest.TestCase): dtype='int64').reshape([1, Config.max_length]))) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) ocr_attention = OCRAttention() if Config.learning_rate_decay == "piecewise_decay": @@ -438,7 +438,7 @@ class TestDygraphOCRAttention(unittest.TestCase): for param in ocr_attention.parameters(): if param.name not in dy_param_init_value: dy_param_init_value[param.name] = param.numpy() - avg_loss.backward(backward_strategy) + avg_loss.backward() dy_grad_value = {} for param in ocr_attention.parameters(): if param.trainable: @@ -454,8 +454,8 @@ class TestDygraphOCRAttention(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ocr_attention = OCRAttention() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index a7783afc5cff3da97b623aec3297881013724a78..7876675bcc6a1cb5ea190adfa16fb5e4de8c2e35 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -74,8 +74,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): with fluid.dygraph.guard(place): try: - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( 
parameter_list=mlp.parameters()) @@ -91,8 +91,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): ) else fluid.CUDAPlace(0) with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( @@ -132,8 +132,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) if place == None: place = fluid.CPUPlace() if not core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..f7756e54168cc90311c443e7a768d4befa2ceda3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -0,0 +1,720 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np +import six +import itertools + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer +from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer +from paddle.fluid.dygraph import Linear +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope + +# Note(wangzhongpu) +# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. 
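# Illustrative sketch (a hypothetical helper, not one of the test cases below): the
# note above is exactly what the *_check_exception tests in this file rely on --
# constructing one of those optimizers under a dygraph guard is expected to raise
# with a fixed message. It reuses the fluid / ModelAverage imports already present
# above; arguments and message are copied from TestImperativeModelAverage further down.
def _example_model_average_not_supported_in_dygraph():
    with fluid.dygraph.guard():
        try:
            ModelAverage(0.15, min_average_window=10000, max_average_window=12500)
        except Exception as e:
            assert str(e) == "In dygraph, don't support ModelAverage."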
+ + +class MLP(fluid.Layer): + def __init__(self, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._fc1 = Linear(784, 10) + self._fc2 = Linear(10, 10) + + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y + + +class TestImperativeOptimizerBase(unittest.TestCase): + def setUp(self): + self.batch_num = 20 + + def get_optimizer_dygraph(self, parameter_list): + raise NotImplementedError() + + def get_optimizer(self): + raise NotImplementedError() + + def reader_decorator(self, reader): + def _reader_imple(): + for item in reader(): + image = np.array(item[0]).reshape(1, 784) + label = np.array(item[1]).astype('int64').reshape(1) + yield image, label + + return _reader_imple + + def _check_exception(self, exception_message, place=None): + seed = 90 + batch_size = 128 + if place == None: + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + with fluid.dygraph.guard(place): + try: + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + except Exception as e: + assert str(e) == exception_message + + def _check_mlp(self, place=None): + seed = 90 + batch_size = 128 + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + with fluid.dygraph.guard(place): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + + batch_py_reader = fluid.io.PyReader(capacity=1) + batch_py_reader.decorate_sample_list_generator( + paddle.batch( + self.reader_decorator(paddle.dataset.mnist.train()), + batch_size=batch_size, + drop_last=True), + places=fluid.CPUPlace()) + + dy_param_init_value = {} + for batch_id, data in enumerate(batch_py_reader()): + if batch_id >= self.batch_num: + break + + img = data[0] + label = data[1] + label.stop_gradient = True + + img = fluid.layers.reshape(img, shape=[batch_size, -1]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + dy_out = avg_loss.numpy() + + if batch_id == 0: + for param in mlp.parameters(): + dy_param_init_value[param.name] = param.numpy() + + avg_loss.backward() + optimizer.minimize(avg_loss) + mlp.clear_gradients() + dy_param_value = {} + for param in mlp.parameters(): + dy_param_value[param.name] = param.numpy() + + with new_program_scope(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + exe = fluid.Executor(place) + + mlp = MLP() + optimizer = self.get_optimizer() + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + img = fluid.layers.reshape(img, shape=[batch_size, 784]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in mlp.parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for 
batch_id, data in enumerate(train_reader()): + if batch_id >= self.batch_num: + break + + static_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + + self.assertTrue(np.allclose(static_out, dy_out)) + + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + + +class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + bd = [3, 6, 9] + optimizer = SGDOptimizer( + learning_rate=paddle.optimizer.PiecewiseLR( + boundaries=bd, + values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + bd = [3, 6, 9] + optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR( + boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = Adam( + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_adam(self): + self._check_mlp() + + +class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle), + parameter_list=parameter_list) + return optimizer + + def 
get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + return optimizer + + def test_sgd_cycle(self): + self.cycle = True + self._check_mlp() + + def test_sgd(self): + self.cycle = False + self._check_mlp() + + +class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestOptimizerLearningRate(unittest.TestCase): + def test_constant_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.001, rtol=1e-06, atol=0.0)) + + for i in range(10): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, 0.001, rtol=1e-06, atol=0.0)) + + def test_lr_decay(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + + scheduler = paddle.optimizer.PiecewiseLR(bd, value) + adam = paddle.optimizer.Adam( + scheduler, parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.2, rtol=1e-06, atol=0.0)) + + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + scheduler.step() + + def test_lr_decay_natural_exp(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + base_lr = 1.0 + + scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5) + print("scheduler.last_lr", scheduler.last_lr) + adam = paddle.optimizer.Adam( + scheduler, parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 1.0, rtol=1e-06, atol=0.0)) + + ret = [1.0, np.exp(-0.5), np.exp(-1)] + for i in range(3): + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + scheduler.step() + + def test_set_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = 
fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue( + np.allclose( + lr, lr_list[i], rtol=1e-06, atol=0.0)) + + with self.assertRaises(TypeError): + lr_var = fluid.layers.create_global_var( + shape=[1], value=0.7, dtype='float32') + adam.set_lr(lr_var) + + with self.assertRaises(RuntimeError): + adam = paddle.optimizer.Adam( + paddle.optimizer.NaturalExpLR( + learning_rate=0.1, gamma=0.5), + parameters=linear.parameters()) + adam.set_lr(0.01) + + +class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = MomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_momentum(self): + self._check_mlp() + + +class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LarsMomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_larsmomentum(self): + self._check_mlp() + + +class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_adagrad(self): + self._check_mlp() + + +class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdamaxOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdamaxOptimizer(learning_rate=0.2) + return optimizer + + def test_adamax(self): + self._check_mlp() + + +class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DpsgdOptimizer( + learning_rate=0.01, + clip=10.0, + batch_size=16.0, + sigma=1.0, + parameter_list=parameter_list) + optimizer._seed = 100 + return optimizer + + def get_optimizer(self): + optimizer = DpsgdOptimizer( + learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) + optimizer._seed = 100 + return optimizer + + def test_dpsgd(self): + self._check_mlp(place=fluid.CPUPlace()) + + +class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DecayedAdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = DecayedAdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_decayadagrad(self): + self._check_mlp() + + +class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdadeltaOptimizer( + learning_rate=0.0003, + epsilon=1.0e-6, + rho=0.95, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = 
AdadeltaOptimizer( + learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) + return optimizer + + def test_adadelta(self): + self._check_mlp() + + +class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = RMSPropOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = RMSPropOptimizer(learning_rate=0.1) + return optimizer + + def test_rmsprop(self): + self._check_mlp() + + +class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = FtrlOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = FtrlOptimizer(learning_rate=0.1) + return optimizer + + def test_ftrl(self): + self._check_mlp() + + +def exclude_fn(param): + return param.name.endswith('.b_0') + + +class TestImperativeLambOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LambOptimizer( + learning_rate=0.002, + exclude_from_weight_decay_fn=exclude_fn, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LambOptimizer( + learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn) + return optimizer + + def test_lamb(self): + self._check_mlp() + + +class TestImperativeModelAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ModelAverage( + 0.15, min_average_window=10000, max_average_window=12500) + return optimizer + + def test_modelaverage(self): + exception_message = "In dygraph, don't support ModelAverage." + self._check_exception(exception_message) + + +class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DGCMomentumOptimizer( + learning_rate=0.0001, + momentum=0.9, + rampup_step=1000, + rampup_begin_step=1252, + sparsity=[0.999, 0.999]) + return optimizer + + def test_dgcmomentum(self): + exception_message = "In dygraph, don't support DGCMomentumOptimizer." + self._check_exception(exception_message) + + +class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ExponentialMovingAverage(0.999) + return optimizer + + def test_exponentialmoving(self): + exception_message = "In dygraph, don't support ExponentialMovingAverage." + self._check_exception(exception_message) + + +class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = PipelineOptimizer(optimizer) + return optimizer + + def test_pipline(self): + exception_message = "In dygraph, don't support PipelineOptimizer." + self._check_exception(exception_message) + + +class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) + return optimizer + + def test_lookahead(self): + exception_message = "In dygraph, don't support LookaheadOptimizer." 
+ self._check_exception(exception_message) + + +class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = RecomputeOptimizer(optimizer) + return optimizer + + def test_recompute(self): + exception_message = "In dygraph, don't support RecomputeOptimizer." + self._check_exception(exception_message) + + +class TestImperativeOptimizerList(unittest.TestCase): + def test_parameter_list(self): + with fluid.dygraph.guard(): + linear_1 = Linear(10, 10) + linear_2 = Linear(10, 10) + + sgd = SGDOptimizer( + 1.0, + parameter_list=itertools.chain(linear_1.parameters(), + linear_2.parameters())) + + in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + in_data = fluid.dygraph.to_variable(in_np) + + y = linear_1(in_data) + y = linear_2(y) + loss = fluid.layers.reduce_mean(y) + loss.backward() + sgd.minimize(loss) + + self.assertTrue( + len(sgd._parameter_list) == + len(linear_1.parameters() + linear_2.parameters())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index bd629f5f4a69a9a8c94f1b2cc58935f0e991ead0..fa23ff8e7c29fa5a07cab03f7407910f687ce9ee 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -225,8 +226,8 @@ class TestDygraphPtbRnn(unittest.TestCase): traced_layer = None with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -293,8 +294,8 @@ class TestDygraphPtbRnn(unittest.TestCase): dy_last_hidden_value = last_hidden.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index 8e85fe5dfefea3221fe0566ac506b1277263eec2..0487f8dd9a640b7d337dbc603030e6dd6bbe7ef7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -43,10 +44,10 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + 
paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -82,7 +83,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): if i == 0: for param in ptb_model.parameters(): dy_param_init[param.name] = param.numpy() - dy_loss.backward(backward_strategy) + dy_loss.backward() sgd.minimize(dy_loss) ptb_model.clear_gradients() if i == batch_num - 1: @@ -94,8 +95,9 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): dy_last_hidden_value = last_hidden.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py index 735ec4d3f1ea869a17bea4efdba9e5dcedb39fb6..0076c61e584074ed091b9b0c80e9aa5be00e48fb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py @@ -64,8 +64,8 @@ class TestImperativeMnist(unittest.TestCase): mask = np.array(mask_list).astype("float32") with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) policy = Policy(input_size=4) @@ -105,8 +105,8 @@ class TestImperativeMnist(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 815437072fde291b8d8348dba0b4b0ae872ec1b9..e8a2298c17d001abeac2f113df08ee01b94c9422 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -251,8 +251,8 @@ class TestDygraphResnet(unittest.TestCase): traced_layer = None with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) resnet = ResNet() optimizer = optimizer_setting( @@ -334,8 +334,8 @@ class TestDygraphResnet(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index 8cbd08ea3e245f70a6a4aceb3f6c9e0b83356981..13b12da3318cad709b1978dc581ff479a1d842c6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -77,10 +77,10 @@ class TestDygraphResnetSortGradient(unittest.TestCase): batch_size = train_parameters["batch_size"] batch_num = 10 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + resnet = ResNet() optimizer = optimizer_setting( train_parameters, parameter_list=resnet.parameters()) @@ -119,7 +119,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase): if param.name not in dy_param_init_value: dy_param_init_value[param.name] = param.numpy() - avg_loss.backward(backward_strategy) + avg_loss.backward() dy_grad_value = {} for param in resnet.parameters(): @@ -137,8 +137,8 @@ class TestDygraphResnetSortGradient(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index eb9dc926c8207f4de4a6ce7e3d0dc89cc2b965fd..48aea3a584dd25667704b22d99d1074c481bb76c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -219,8 +219,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -305,8 +305,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -374,6 +374,7 @@ class TestDygraphPtbRnn(unittest.TestCase): adam._learning_rate.step_num = 0 para_state_dict, opti_state_dict = paddle.load("./test_dy") + print(opti_state_dict['LR_Scheduler']) adam.set_dict(opti_state_dict) opti_dict = adam.state_dict() @@ -414,8 +415,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -521,8 +522,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -634,8 +635,6 @@ 
class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -713,8 +712,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -804,9 +803,10 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..e81d1c8610f6bebffadf930b67dc14a4a418ef05 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -0,0 +1,922 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
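# Illustrative summary (a sketch assembled from the test bodies that follow, not
# code of its own): every case in this new file repeats roughly the same v2-style
# save/restore flow, differing only in whether the restored state dicts hold
# VarBase values or numpy arrays:
#
#     scheduler = paddle.optimizer.PiecewiseLR(boundaries=bd, values=lr_arr)
#     adam = Adam(learning_rate=scheduler, parameters=ptb_model.parameters())
#     # ... run a few batches: dy_loss.backward(); adam.minimize(dy_loss); scheduler.step()
#     fluid.save_dygraph(adam.state_dict(), "./test_dy_v2")
#     paddle.save(ptb_model.state_dict(), "./test_dy_v2")
#     para_state_dict, opti_state_dict = paddle.load("./test_dy_v2")
#     ptb_model.set_dict(para_state_dict)
#     adam.set_state_dict(opti_state_dict)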
+ +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.dygraph.nn import Embedding, Linear +import paddle.fluid.framework as framework +from paddle.optimizer import Adam +from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay +from test_imperative_base import new_program_scope +import numpy as np +import six +import paddle + + +class SimpleLSTMRNN(fluid.Layer): + def __init__(self, + hidden_size, + num_steps, + num_layers=2, + init_scale=0.1, + dropout=None): + super(SimpleLSTMRNN, self).__init__() + self._hidden_size = hidden_size + self._num_layers = num_layers + self._init_scale = init_scale + self._dropout = dropout + self._input = None + self._num_steps = num_steps + self.cell_array = [] + self.hidden_array = [] + self.weight_1_arr = [] + self.weight_2_arr = [] + self.bias_arr = [] + self.mask_array = [] + + for i in range(self._num_layers): + weight_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 2, self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)) + self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) + bias_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.Constant(0.0)) + self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) + + def forward(self, input_embedding, init_hidden=None, init_cell=None): + self.cell_array = [] + self.hidden_array = [] + + for i in range(self._num_layers): + pre_hidden = fluid.layers.slice( + init_hidden, axes=[0], starts=[i], ends=[i + 1]) + pre_cell = fluid.layers.slice( + init_cell, axes=[0], starts=[i], ends=[i + 1]) + pre_hidden = fluid.layers.reshape( + pre_hidden, shape=[-1, self._hidden_size]) + pre_cell = fluid.layers.reshape( + pre_cell, shape=[-1, self._hidden_size]) + self.hidden_array.append(pre_hidden) + self.cell_array.append(pre_cell) + + res = [] + for index in range(self._num_steps): + self._input = fluid.layers.slice( + input_embedding, axes=[1], starts=[index], ends=[index + 1]) + self._input = fluid.layers.reshape( + self._input, shape=[-1, self._hidden_size]) + for k in range(self._num_layers): + pre_hidden = self.hidden_array[k] + pre_cell = self.cell_array[k] + weight_1 = self.weight_1_arr[k] + bias = self.bias_arr[k] + + nn = fluid.layers.concat([self._input, pre_hidden], 1) + gate_input = fluid.layers.matmul(x=nn, y=weight_1) + + gate_input = fluid.layers.elementwise_add(gate_input, bias) + i, j, f, o = fluid.layers.split( + gate_input, num_or_sections=4, dim=-1) + c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( + i) * fluid.layers.tanh(j) + m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + self.hidden_array[k] = m + self.cell_array[k] = c + self._input = m + + if self._dropout is not None and self._dropout > 0.0: + self._input = fluid.layers.dropout( + self._input, + dropout_prob=self._dropout, + dropout_implementation='upscale_in_train') + res.append( + fluid.layers.reshape( + self._input, shape=[1, -1, self._hidden_size])) + real_res = fluid.layers.concat(res, 0) + real_res = 
fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + last_hidden = fluid.layers.concat(self.hidden_array, 1) + last_hidden = fluid.layers.reshape( + last_hidden, shape=[-1, self._num_layers, self._hidden_size]) + last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_cell = fluid.layers.concat(self.cell_array, 1) + last_cell = fluid.layers.reshape( + last_cell, shape=[-1, self._num_layers, self._hidden_size]) + last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + return real_res, last_hidden, last_cell + + +class PtbModel(fluid.Layer): + def __init__(self, + hidden_size, + vocab_size, + num_layers=2, + num_steps=20, + init_scale=0.1, + dropout=None): + super(PtbModel, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.init_scale = init_scale + self.num_layers = num_layers + self.num_steps = num_steps + self.dropout = dropout + self.simple_lstm_rnn = SimpleLSTMRNN( + hidden_size, + num_steps, + num_layers=num_layers, + init_scale=init_scale, + dropout=dropout) + self.embedding = Embedding( + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False, + param_attr=fluid.ParamAttr( + name='embedding_para', + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))) + + self.softmax_weight = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.hidden_size, self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + self.softmax_bias = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + + def forward(self, input, label, init_hidden, init_cell): + init_h = fluid.layers.reshape( + init_hidden, shape=[self.num_layers, -1, self.hidden_size]) + + init_c = fluid.layers.reshape( + init_cell, shape=[self.num_layers, -1, self.hidden_size]) + + x_emb = self.embedding(input) + x_emb = fluid.layers.reshape( + x_emb, shape=[-1, self.num_steps, self.hidden_size]) + if self.dropout is not None and self.dropout > 0.0: + x_emb = fluid.layers.dropout( + x_emb, + dropout_prob=self.drop_out, + dropout_implementation='upscale_in_train') + rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h, + init_c) + rnn_out = fluid.layers.reshape( + rnn_out, shape=[-1, self.num_steps, self.hidden_size]) + + projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = fluid.layers.elementwise_add(projection, self.softmax_bias) + projection = fluid.layers.reshape( + projection, shape=[-1, self.vocab_size]) + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=label, soft_label=False) + loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = fluid.layers.reduce_mean(loss, dim=[0]) + loss = fluid.layers.reduce_sum(loss) + + return loss, last_hidden, last_cell + + +class TestDygraphPtbRnn(unittest.TestCase): + def setUp(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay 
strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) + adam = Adam( + learning_rate=scheduler, parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + scheduler.step() + ptb_model.clear_gradients() + + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + self.opti_dict = adam.state_dict() + self.base_opti = {} + for k, v in self.opti_dict.items(): + if isinstance(v, core.VarBase): + self.base_opti[v.name] = v.numpy() + self.assertTrue(np.sum(np.abs(v.numpy())) != 0) + else: + self.base_opti[k] = v + + fluid.save_dygraph(self.opti_dict, "./test_dy_v2") + + self.state_dict = ptb_model.state_dict() + + self.model_base = {} + for k, v in self.state_dict.items(): + np_t = v.numpy() + self.model_base[k] = np_t + + paddle.save(self.state_dict, "./test_dy_v2") + + def testLoadAndSetVarBase(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) + adam = Adam( + learning_rate=scheduler, parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + 
scheduler.step() + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + + para_state_dict, opti_state_dict = paddle.load("./test_dy_v2") + adam.set_state_dict(opti_state_dict) + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + for k, v in state_dict.items(): + np_t = v.numpy() + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(para_state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetVariable(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) + adam = Adam( + learning_rate=scheduler, parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + scheduler.step() + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + adam.set_state_dict(self.opti_dict) + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + 
np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + for k, v in state_dict.items(): + np_t = v.numpy() + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(self.state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetNumpy(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) + adam = Adam( + learning_rate=scheduler, parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + scheduler.step() + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + np_opti_dict = {} + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + np_opti_dict[v.name] = np_t + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + else: + np_opti_dict[k] = v + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + adam.set_state_dict(np_opti_dict) + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + np_state_dict = {} + for k, v in state_dict.items(): + np_t = v.numpy() + np_state_dict[k] = np_t + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(np_state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetVariableBeforeTrain(self): + seed = 
90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=0.0, + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + adam.set_state_dict(self.opti_dict) + ptb_model.set_dict(self.state_dict) + + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def testLoadAndSetVarBaseBeforeTrain(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [0.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + # set lr to zero not update parameter + new_lr = 0.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=0.0, + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + state_dict, opti_dict = fluid.load_dygraph("./test_dy_v2") + adam.set_state_dict(opti_dict) + ptb_model.set_dict(state_dict) + + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + 
init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + # check parameter + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetNumpyBeforeTrain(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [0.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + # set lr to 0.0, not update parameter + new_lr = 0.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) + adam = Adam( + learning_rate=scheduler, + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + np_opti_dict = {} + np_state_dict = {} + + for k, v in self.opti_dict.items(): + if isinstance(v, core.VarBase): + np_opti_dict[v.name] = v.numpy() + else: + np_opti_dict[k] = v + + for k, v in self.state_dict.items(): + np_state_dict[k] = v.numpy() + + adam.set_state_dict(np_opti_dict) + ptb_model.set_dict(np_state_dict) + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + scheduler.step() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "LR_Scheduler": + self.assertTrue( + np.array_equal(v['last_epoch'], self.base_opti[k][ + 'last_epoch'] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + # check parameter + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def 
testOnlyLoadParams(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy')) + + self.assertTrue(opti_state_dict == None) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdparams')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdopt')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 283addaf6283a5365d983e4737bf2a8fdf5ee0b9..a04e1e4e5aafeeb605348b30125c5d42b3171674 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -308,8 +308,8 @@ class TestImperativeResneXt(unittest.TestCase): batch_num = 1 epoch_num = 1 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) se_resnext = SeResNeXt() optimizer = optimizer_setting( @@ -367,8 +367,8 @@ class TestImperativeResneXt(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index 9878e2f9ad772fe3d03addb4ced9f3b66a6cd58a..59ddb365e539603c1eba06ca8828fc244b6e542d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -48,8 +48,9 @@ class TestSimpleNet(unittest.TestCase): for dtype in ["float32", "float64"]: for sort_sum_gradient in [True, False]: paddle.disable_static(place) - backward_strategy = paddle.BackwardStrategy() - backward_strategy.sort_sum_gradient = sort_sum_gradient + fluid.set_flags({ + 'FLAGS_sort_sum_gradient': sort_sum_gradient + }) # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') @@ -65,7 +66,7 @@ class TestSimpleNet(unittest.TestCase): self.assertTrue(emb.weight.gradient() is None) self.assertTrue(input_emb.gradient() is None) - input_emb.backward(backward_strategy) + input_emb.backward() adam.minimize(input_emb) self.assertTrue(emb.weight.gradient() is not None) @@ -84,8 +85,9 @@ class TestSimpleNet(unittest.TestCase): for place in places: for sort_sum_gradient in [True, False]: with fluid.dygraph.guard(place): - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = sort_sum_gradient + fluid.set_flags({ + 'FLAGS_sort_sum_gradient': sort_sum_gradient + }) grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') @@ -101,7 +103,7 @@ class TestSimpleNet(unittest.TestCase): self.assertTrue(emb.weight.gradient() is None) self.assertTrue(input_emb.gradient() is None) - input_emb.backward(backward_strategy) + input_emb.backward() 
adam.minimize(input_emb) self.assertTrue(emb.weight.gradient() is not None) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index a42a62019ba54a771d26ad853e39fcf8ca991180..794f59e48507e6002311e54e8ae31f3ad1bf4647 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -101,8 +102,8 @@ class TestDygraphSimpleNet(unittest.TestCase): for is_sort_sum_gradient in [True, False]: traced_layer = None with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, @@ -119,8 +120,9 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_param_init = dict() dy_loss = None - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = is_sort_sum_gradient + fluid.set_flags({ + 'FLAGS_sort_sum_gradient': is_sort_sum_gradient + }) for i in range(batch_num): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -135,7 +137,7 @@ class TestDygraphSimpleNet(unittest.TestCase): if i == 0: for param in simple_net.parameters(): dy_param_init[param.name] = param.numpy() - dy_loss.backward(backward_strategy) + dy_loss.backward() sgd.minimize(dy_loss) sgd.clear_gradients() if i == batch_num - 1: @@ -144,8 +146,8 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_loss_value = dy_loss.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 649dc1ad91d3878dacc551fd08527885c3f479aa..e94157fa047eef065bc4bd0bfb3d6b6c778ea7b9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -468,8 +468,8 @@ def build_optimizer(layer, cfg, loss=None): class DyGraphTrainModel(object): def __init__(self, cfg): - fluid.default_startup_program().random_seed = cfg.seed - fluid.default_main_program().random_seed = cfg.seed + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) self.generator = Generator(cfg) self.discriminator = Discriminator(cfg) @@ -479,8 +479,7 @@ class DyGraphTrainModel(object): self.cfg = cfg - self.backward_strategy = fluid.dygraph.BackwardStrategy() - self.backward_strategy.sort_sum_gradient = cfg.sort_sum_gradient + fluid.set_flags({'FLAGS_sort_sum_gradient': cfg.sort_sum_gradient}) def clear_gradients(self): if self.g_optimizer: @@ -497,7 +496,7 @@ class DyGraphTrainModel(object): g_loss = get_generator_loss(image_real, label_org, label_trg, self.generator, self.discriminator, self.cfg) - g_loss.backward(self.backward_strategy) + g_loss.backward() if self.g_optimizer: 
self.g_optimizer.minimize(g_loss) @@ -506,7 +505,7 @@ class DyGraphTrainModel(object): d_loss = get_discriminator_loss(image_real, label_org, label_trg, self.generator, self.discriminator, self.cfg) - d_loss.backward(self.backward_strategy) + d_loss.backward() if self.d_optimizer: self.d_optimizer.minimize(d_loss) @@ -530,12 +529,12 @@ class StaticGraphTrainModel(object): shape=[None, cfg.c_dim], dtype='float32', name='label_trg') return image_real, label_org, label_trg + paddle.manual_seed(cfg.seed) + paddle.framework.random._manual_program_seed(cfg.seed) self.gen_program = fluid.Program() gen_startup_program = fluid.Program() with fluid.program_guard(self.gen_program, gen_startup_program): - self.gen_program.random_seed = cfg.seed - gen_startup_program.random_seed = cfg.seed with fluid.unique_name.guard(): image_real, label_org, label_trg = create_data_layer() generator = Generator(cfg) @@ -547,8 +546,6 @@ class StaticGraphTrainModel(object): self.dis_program = fluid.Program() dis_startup_program = fluid.Program() with fluid.program_guard(self.dis_program, dis_startup_program): - self.dis_program.random_seed = cfg.seed - dis_startup_program.random_seed = cfg.seed with fluid.unique_name.guard(): image_real, label_org, label_trg = create_data_layer() generator = Generator(cfg) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py index acc56b7db27f48cad92ed44cddfcfd4b9591dba3..f10d2df7f06f98334df62d3021403d686054b7d9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py @@ -121,8 +121,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = self.seed fluid.default_main_program().random_seed = self.seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) mnist = fluid.dygraph.static_runner.StaticModelRunner( model_dir=self.save_dirname, @@ -156,7 +155,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase): loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.mean(loss) - avg_loss.backward(backward_strategy) + avg_loss.backward() sgd.minimize(avg_loss) mnist.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py index 0792582175ef03cba3d3ba809132f3c591ecfe87..db47170c7bfff4575a9b4dcf694cd8ed722b0b8f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py @@ -111,9 +111,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase): fluid.default_startup_program().random_seed = self.seed fluid.default_main_program().random_seed = self.seed np.random.seed(self.seed) - - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) while_net = fluid.dygraph.static_runner.StaticModelRunner( self.save_dirname) @@ -141,7 +139,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase): loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.mean(loss) - avg_loss.backward(backward_strategy) + 
avg_loss.backward() sgd.minimize(avg_loss) while_net.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 29cc718f14ff98de2b668d313d380d784cbaa6ef..9f58ef881e4e47365be03cdb5786ac292c938f03 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid from paddle.fluid import Embedding, LayerNorm, Linear, Layer from paddle.fluid.dygraph import to_variable, guard @@ -949,10 +950,9 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): seed = 90 with guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) transformer = TransFormer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, @@ -1021,7 +1021,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): for param in transformer.parameters(): dy_param_init[param.name] = param.numpy() - dy_avg_cost.backward(backward_strategy) + dy_avg_cost.backward() optimizer.minimize(dy_avg_cost) transformer.clear_gradients() @@ -1035,8 +1035,8 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): dy_token_num_value = dy_token_num.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) transformer = TransFormer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..e329a37488a2cb8234532cd0a9beb7a1a25e72a6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_input_spec.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
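+# The cases below cover paddle.static.InputSpec: default dtype inference,
+# construction via InputSpec.from_tensor / InputSpec.from_numpy, None entries
+# in the shape (stored as -1), batch()/unbatch(), and the __eq__/__hash__
+# contract. A minimal usage sketch, mirroring the assertions in this file
+# (illustrative only, parameter order as used below):
+#
+#   from paddle.static import InputSpec
+#   x_spec = InputSpec([None, 784], dtype='float32', name='x')
+#   # None dims are normalized to -1, so x_spec.shape == (-1, 784)
+#   batched = InputSpec([10]).batch(16)  # shape becomes (16, 10)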
+ +import unittest +import numpy as np +import paddle.fluid as fluid +from paddle.static import InputSpec +from paddle.fluid.framework import core, convert_np_dtype_to_dtype_ + + +class TestInputSpec(unittest.TestCase): + def test_default(self): + tensor_spec = InputSpec([3, 4]) + self.assertEqual(tensor_spec.dtype, + convert_np_dtype_to_dtype_('float32')) + self.assertEqual(tensor_spec.name, None) + + def test_from_tensor(self): + x_bool = fluid.layers.fill_constant(shape=[1], dtype='bool', value=True) + bool_spec = InputSpec.from_tensor(x_bool) + self.assertEqual(bool_spec.dtype, x_bool.dtype) + self.assertEqual(bool_spec.shape, x_bool.shape) + self.assertEqual(bool_spec.name, x_bool.name) + + bool_spec2 = InputSpec.from_tensor(x_bool, name='bool_spec') + self.assertEqual(bool_spec2.name, bool_spec2.name) + + def test_from_numpy(self): + x_numpy = np.ones([10, 12]) + x_np_spec = InputSpec.from_numpy(x_numpy) + self.assertEqual(x_np_spec.dtype, + convert_np_dtype_to_dtype_(x_numpy.dtype)) + self.assertEqual(x_np_spec.shape, x_numpy.shape) + self.assertEqual(x_np_spec.name, None) + + x_numpy2 = np.array([1, 2, 3, 4]).astype('int64') + x_np_spec2 = InputSpec.from_numpy(x_numpy2, name='x_np_int64') + self.assertEqual(x_np_spec2.dtype, + convert_np_dtype_to_dtype_(x_numpy2.dtype)) + self.assertEqual(x_np_spec2.shape, x_numpy2.shape) + self.assertEqual(x_np_spec2.name, 'x_np_int64') + + def test_shape_with_none(self): + tensor_spec = InputSpec([None, 4, None], dtype='int8', name='x_spec') + self.assertEqual(tensor_spec.dtype, convert_np_dtype_to_dtype_('int8')) + self.assertEqual(tensor_spec.name, 'x_spec') + self.assertEqual(tensor_spec.shape, (-1, 4, -1)) + + def test_shape_raise_error(self): + # 1. shape should only contain int and None. + with self.assertRaises(ValueError): + tensor_spec = InputSpec(['None', 4, None], dtype='int8') + + # 2. shape should be type `list` or `tuple` + with self.assertRaises(TypeError): + tensor_spec = InputSpec(4, dtype='int8') + + # 3. len(shape) should be greater than 0. + with self.assertRaises(ValueError): + tensor_spec = InputSpec([], dtype='int8') + + def test_batch_and_unbatch(self): + tensor_spec = InputSpec([10]) + # insert batch_size + batch_tensor_spec = tensor_spec.batch(16) + self.assertEqual(batch_tensor_spec.shape, (16, 10)) + + # unbatch + unbatch_spec = batch_tensor_spec.unbatch() + self.assertEqual(unbatch_spec.shape, (10, )) + + # 1. `unbatch` requires len(shape) > 1 + with self.assertRaises(ValueError): + unbatch_spec.unbatch() + + # 2. `batch` requires len(batch_size) == 1 + with self.assertRaises(ValueError): + tensor_spec.batch([16, 12]) + + # 3. 
`batch` requires type(batch_size) == int + with self.assertRaises(TypeError): + tensor_spec.batch('16') + + def test_eq_and_hash(self): + tensor_spec_1 = InputSpec([10, 16], dtype='float32') + tensor_spec_2 = InputSpec([10, 16], dtype='float32') + tensor_spec_3 = InputSpec([10, 16], dtype='float32', name='x') + tensor_spec_4 = InputSpec([16], dtype='float32', name='x') + + # override ``__eq__`` according to [shape, dtype, name] + self.assertTrue(tensor_spec_1 == tensor_spec_2) + self.assertTrue(tensor_spec_1 != tensor_spec_3) # different name + self.assertTrue(tensor_spec_3 != tensor_spec_4) # different shape + + # override ``__hash__`` according to [shape, dtype] + self.assertTrue(hash(tensor_spec_1) == hash(tensor_spec_2)) + self.assertTrue(hash(tensor_spec_1) == hash(tensor_spec_3)) + self.assertTrue(hash(tensor_spec_3) != hash(tensor_spec_4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..b02ba1a584b52dbbc99fcc8ed7bad438e7a9dd46 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -0,0 +1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
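+# The cases below exercise the 2.0-style instance-norm layers: test_error
+# checks that inputs of the wrong rank raise ValueError, while test_dygraph
+# and test_static compare the new paddle.nn.InstanceNorm2d against the legacy
+# fluid.dygraph.InstanceNorm on random NCHW data. The equivalence being
+# asserted is roughly (illustrative sketch only):
+#
+#   y_old = fluid.dygraph.InstanceNorm(num_channels)(fluid.dygraph.to_variable(x))
+#   y_new = paddle.nn.InstanceNorm2d(num_channels)(fluid.dygraph.to_variable(x))
+#   assert np.allclose(y_old.numpy(), y_new.numpy())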
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestInstanceNorm(unittest.TestCase): + def test_error(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + + def error1d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + instance_norm1d = paddle.nn.InstanceNorm1d(1) + instance_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + instance_norm2d = paddle.nn.InstanceNorm2d(1) + instance_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + instance_norm3d = paddle.nn.BatchNorm3d(1) + instance_norm3d(fluid.dygraph.to_variable(x_data_4)) + + with fluid.dygraph.guard(p): + self.assertRaises(ValueError, error1d) + self.assertRaises(ValueError, error2d) + self.assertRaises(ValueError, error3d) + + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.InstanceNorm(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.InstanceNorm2d(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np): + with program_guard(Program(), Program()): + ins = fluid.dygraph.InstanceNorm(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ins(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + ins = paddle.nn.InstanceNorm2d(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ins(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py index c5228fcf122748d2518238aa21ea486ed5f60d46..eaa7e711a29c7b96691f630733d913003fce9e43 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py @@ -37,10 +37,10 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase): use_cuda=True, use_mem_opt=False, iter_num=5): + paddle.manual_seed(100) + 
paddle.framework.random._manual_program_seed(100) prog = Program() startup_prog = Program() - prog.random_seed = 100 - startup_prog.random_seed = 100 with program_guard(prog, startup_prog): image = layers.data(name='x', shape=[784], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8a868e751f0567e6387b0e9471f0382c9456bcb6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py @@ -0,0 +1,161 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +import unittest +import numpy as np + + +def run_static(x_np, dtype, op_str, use_gpu=False): + paddle.enable_static() + startup_program = fluid.Program() + main_program = fluid.Program() + place = paddle.CPUPlace() + if use_gpu and fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + with fluid.program_guard(main_program, startup_program): + x = paddle.data(name='x', shape=x_np.shape, dtype=dtype) + res = getattr(paddle.tensor, op_str)(x) + exe.run(startup_program) + static_result = exe.run(main_program, + feed={'x': x_np}, + fetch_list=[res]) + return static_result + + +def run_dygraph(x_np, op_str, use_gpu=True): + place = paddle.CPUPlace() + if use_gpu and fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + paddle.disable_static(place) + x = paddle.to_variable(x_np) + dygraph_result = getattr(paddle.tensor, op_str)(x) + return dygraph_result + + +def np_data_generator(low, high, np_shape, type, sv_list, op_str, *args, + **kwargs): + x_np = np.random.uniform(low, high, np_shape).astype(getattr(np, type)) + # x_np.shape[0] >= len(sv_list) + if type in ['float16', 'float32', 'float64']: + for i, v in enumerate(sv_list): + x_np[i] = v + ori_shape = x_np.shape + x_np = x_np.reshape((np.product(ori_shape), )) + np.random.shuffle(x_np) + x_np = x_np.reshape(ori_shape) + result_np = getattr(np, op_str)(x_np) + return x_np, result_np + + +TEST_META_DATA = [ + { + 'low': 0.1, + 'high': 1, + 'np_shape': [8, 17, 5, 6, 7], + 'type': 'float16', + 'sv_list': [np.inf, np.nan] + }, + { + 'low': 0.1, + 'high': 1, + 'np_shape': [11, 17], + 'type': 'float32', + 'sv_list': [np.inf, np.nan] + }, + { + 'low': 0.1, + 'high': 1, + 'np_shape': [2, 3, 4, 5], + 'type': 'float64', + 'sv_list': [np.inf, np.nan] + }, + { + 'low': 0, + 'high': 100, + 'np_shape': [11, 17, 10], + 'type': 'int32', + 'sv_list': [np.inf, np.nan] + }, + { + 'low': 0, + 'high': 999, + 'np_shape': [132], + 'type': 'int64', + 'sv_list': [np.inf, np.nan] + }, +] + + +def test(test_case, op_str, use_gpu=False): + for meta_data in TEST_META_DATA: + meta_data = dict(meta_data) + meta_data['op_str'] = op_str + x_np, result_np = np_data_generator(**meta_data) + static_result = run_static(x_np, meta_data['type'], op_str, use_gpu) + dygraph_result = 
run_dygraph(x_np, op_str, use_gpu) + test_case.assertTrue((static_result == result_np).all()) + test_case.assertTrue((dygraph_result.numpy() == result_np).all()) + + +class TestCPUNormal(unittest.TestCase): + def test_inf(self): + test(self, 'isinf') + + def test_nan(self): + test(self, 'isnan') + + def test_finite(self): + test(self, 'isfinite') + + +class TestCUDANormal(unittest.TestCase): + def test_inf(self): + test(self, 'isinf', True) + + def test_nan(self): + test(self, 'isnan', True) + + def test_finite(self): + test(self, 'isfinite', True) + + +class TestError(unittest.TestCase): + def test_bad_input(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + + def test_isinf_bad_x(): + x = [1, 2, 3] + result = paddle.tensor.isinf(x) + + self.assertRaises(TypeError, test_isinf_bad_x) + + def test_isnan_bad_x(): + x = [1, 2, 3] + result = paddle.tensor.isnan(x) + + self.assertRaises(TypeError, test_isnan_bad_x) + + def test_isfinite_bad_x(): + x = [1, 2, 3] + result = paddle.tensor.isfinite(x) + + self.assertRaises(TypeError, test_isfinite_bad_x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 89b12da9cf999618407c9b094dff852fc7428704..87b6e76a6d0ab7f5fba7c4526734d81475e1540e 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -15,17 +15,18 @@ from __future__ import print_function import os +import pickle import unittest import numpy as np - import paddle +from paddle.static import InputSpec import paddle.fluid as fluid from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME BATCH_SIZE = 32 -BATCH_NUM = 20 +BATCH_NUM = 10 SEED = 10 @@ -79,7 +80,7 @@ class LinearNetReturnLoss(fluid.dygraph.Layer): def train(layer, input_size=784, label_size=1): # create optimizer - adam = fluid.optimizer.SGDOptimizer( + sgd = fluid.optimizer.SGDOptimizer( learning_rate=0.01, parameter_list=layer.parameters()) # create data loader train_loader = fluid.io.DataLoader.from_generator(capacity=5) @@ -96,7 +97,7 @@ def train(layer, input_size=784, label_size=1): avg_loss = fluid.layers.mean(loss) avg_loss.backward() - adam.minimize(avg_loss) + sgd.minimize(avg_loss) layer.clear_gradients() return [img], layer, avg_loss @@ -107,7 +108,8 @@ class TestJitSaveLoad(unittest.TestCase): # enable dygraph mode fluid.enable_dygraph() # config seed - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) def train_and_save_model(self, model_path=None, configs=None): layer = LinearNet(784, 1) @@ -148,14 +150,14 @@ class TestJitSaveLoad(unittest.TestCase): train_layer.train() load_train_layer.train() # train & compare - _, _, train_loss = train(train_layer) - _, _, load_train_loss = train(load_train_layer) + img0, _, train_loss = train(train_layer) + img1, _, load_train_loss = train(load_train_layer) self.assertTrue( np.array_equal(train_loss.numpy(), load_train_loss.numpy())) def load_dygraph_state_dict(self, train_layer): train_layer.eval() - # contruct new model + # construct new model new_layer = LinearNet(784, 1) model_dict, _ = fluid.dygraph.load_dygraph(self.model_path) new_layer.set_dict(model_dict) @@ -175,7 +177,7 
@@ class TestJitSaveLoad(unittest.TestCase): model_path=self.model_path, input_spec=example_inputs) - def test_load_dygraoh_no_path(self): + def test_load_dygraph_no_path(self): model_path = "model.test_jit_save_load.no_path" new_layer = LinearNet(784, 1) with self.assertRaises(ValueError): @@ -201,12 +203,99 @@ class TestJitSaveLoad(unittest.TestCase): model_dict, _ = fluid.dygraph.load_dygraph(model_path) +class LinearNetMultiInput(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetMultiInput, self).__init__() + self._linear1 = Linear(in_size, out_size) + # self._linear2 = Linear(in_size, out_size) + + @declarative(input_spec=[ + InputSpec( + [None, 8], dtype='float32'), InputSpec( + [None, 8], dtype='float32') + ]) + def forward(self, x, y): + x_out = self._linear1(x) + y_out = self._linear1(y) + loss = fluid.layers.mean(x_out + y_out) + return x_out, y_out, loss + + +class TestSaveLoadWithInputSpec(unittest.TestCase): + def setUp(self): + # enable dygraph mode + fluid.enable_dygraph() + + def test_with_input_spec(self): + net = LinearNetReturnLoss(8, 8) + # set x.shape = [None, 8] + net.forward = declarative( + net.forward, input_spec=[InputSpec( + [None, 8], name='x')]) + + model_path = "model.input_spec.output_spec" + configs = fluid.dygraph.jit.SaveLoadConfig() + # check inputs and outputs + self.assertTrue(len(net.forward.inputs) == 1) + input_x = net.forward.inputs[0] + self.assertTrue(input_x.shape == (-1, 8)) + self.assertTrue(input_x.name == 'x') + + # 1. prune loss + configs.output_spec = net.forward.outputs[:1] + fluid.dygraph.jit.save(net, model_path, configs=configs) + + # 2. load to infer + infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + x = fluid.dygraph.to_variable( + np.random.random((4, 8)).astype('float32')) + pred = infer_layer(x) + + def test_multi_in_out(self): + net = LinearNetMultiInput(8, 8) + + model_path = "model.multi_inout.output_spec1" + configs = fluid.dygraph.jit.SaveLoadConfig() + # 1. check inputs and outputs + self.assertTrue(len(net.forward.inputs) == 2) + input_x = net.forward.inputs[0] + input_y = net.forward.inputs[1] + self.assertTrue(input_x.shape == (-1, 8)) + self.assertTrue(input_y.shape == (-1, 8)) + + # 2. prune loss + configs.output_spec = net.forward.outputs[:2] + fluid.dygraph.jit.save(net, model_path, configs=configs) + + # 3. load to infer + infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + x = fluid.dygraph.to_variable( + np.random.random((4, 8)).astype('float32')) + y = fluid.dygraph.to_variable( + np.random.random((4, 8)).astype('float32')) + # 4. predict + pred_x, pred_y = infer_layer(x, y) + + # 1. prune y and loss + model_path = "model.multi_inout.output_spec2" + configs.output_spec = net.forward.outputs[:1] + fluid.dygraph.jit.save(net, model_path, [input_x], configs) + # 2. load again + infer_layer2 = fluid.dygraph.jit.load(model_path, configs=configs) + # 3. predict + pred_xx = infer_layer2(x) + + # 4. assert pred_x == pred_xx + self.assertTrue(np.allclose(pred_x.numpy(), pred_xx.numpy())) + + class TestJitSaveLoadConfig(unittest.TestCase): def setUp(self): # enable dygraph mode fluid.enable_dygraph() # config seed - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) def basic_save_load(self, layer, model_path, configs): # 1. 
train & save @@ -298,7 +387,8 @@ class TestJitMultipleLoading(unittest.TestCase): # enable dygraph mode fluid.enable_dygraph() # config seed - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # train and save base model self.train_and_save_orig_model() @@ -318,5 +408,77 @@ class TestJitMultipleLoading(unittest.TestCase): name_set.add(var.name) +class LinearNetReturnHidden(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetReturnHidden, self).__init__() + self._linear_1 = Linear(in_size, out_size) + self._linear_2 = Linear(in_size, out_size) + + @declarative + def forward(self, x): + y = self._linear_1(x) + z = self._linear_2(y) + loss = fluid.layers.mean(z) + return y, loss + + +class TestJitPruneModelAndLoad(unittest.TestCase): + def setUp(self): + self.linear_size = 4 + self.model_path = "model.jit_prune_model_and_load" + # enable dygraph mode + fluid.enable_dygraph() + # config seed + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + def train_and_save(self): + train_layer = LinearNetReturnHidden(8, 8) + adam = fluid.optimizer.AdamOptimizer( + learning_rate=0.1, parameter_list=train_layer.parameters()) + x = fluid.dygraph.to_variable( + np.random.random((4, 8)).astype('float32')) + for i in range(10): + hidden, loss = train_layer(x) + loss.backward() + adam.minimize(loss) + train_layer.clear_gradients() + + configs = fluid.dygraph.jit.SaveLoadConfig() + configs.output_spec = [hidden] + fluid.dygraph.jit.save( + layer=train_layer, + model_path=self.model_path, + input_spec=[x], + configs=configs) + + return train_layer + + def test_load_pruned_model(self): + train_layer = self.train_and_save() + train_layer.eval() + + infer_layer = fluid.dygraph.jit.load(self.model_path) + + x = fluid.dygraph.to_variable( + np.random.random((4, 8)).astype('float32')) + self.assertTrue( + np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy())) + + def test_load_var_not_in_extra_var_info(self): + self.train_and_save() + + # chage extra var info + var_info_path = os.path.join(self.model_path, EXTRA_VAR_INFO_FILENAME) + with open(var_info_path, 'rb') as f: + extra_var_info = pickle.load(f) + extra_var_info.clear() + with open(var_info_path, 'wb') as f: + pickle.dump(extra_var_info, f, protocol=2) + + with self.assertRaises(RuntimeError): + fluid.dygraph.jit.load(self.model_path) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..f324e4bd377c616fb14b2b6df2b936b04ed76ff5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestDygraphLayerNormv2(unittest.TestCase): + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x): + with fluid.dygraph.guard(p): + ln = fluid.dygraph.LayerNorm(shape[1:]) + y = ln(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + ln = paddle.nn.LayerNorm(shape[1:]) + y = ln(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np): + with program_guard(Program(), Program()): + ln = fluid.dygraph.LayerNorm(shape[1:]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ln(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + ln = paddle.nn.LayerNorm(shape[1:]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ln(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 91186b2e95ae00b15503dc29feb1c2b1039e744f..eb79a80da99fabd18ad9a8cbdd8f6aba08209d05 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -57,8 +57,8 @@ class LayerTest(unittest.TestCase): @contextlib.contextmanager def static_graph(self): with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) yield def get_static_graph_result(self, @@ -77,8 +77,8 @@ class LayerTest(unittest.TestCase): def dynamic_graph(self, force_to_use_cpu=False): with fluid.dygraph.guard( self._get_place(force_to_use_cpu=force_to_use_cpu)): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) yield @@ -287,7 +287,7 @@ class TestLayer(LayerTest): if core.is_compiled_with_cuda(): with self.static_graph(): t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32') - my_sync_bn = nn.SyncBatchNorm(3) + my_sync_bn = paddle.nn.SyncBatchNorm(3) ret = my_sync_bn(t) static_ret = self.get_static_graph_result( feed={'t': np.ones( @@ -316,21 +316,6 @@ class TestLayer(LayerTest): 
self.assertTrue(np.allclose(static_ret, dy_ret_value)) - def test_leakyrelu(self): - inputs = np.random.uniform(-1, 1, (10, 10)).astype('float32') - with self.static_graph(): - t = layers.data(name='t', shape=[10, 10], dtype='float32') - ret = layers.leaky_relu(t, alpha=0.01) - static_ret = self.get_static_graph_result( - feed={'t': inputs}, fetch_list=[ret])[0] - - with self.dynamic_graph(): - lrelu = paddle.nn.LeakyReLU(alpha=0.01) - dy_ret = lrelu(base.to_variable(inputs)) - dy_ret_value = dy_ret.numpy() - - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - def test_pad2d(self): with self.static_graph(): t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32') @@ -1049,7 +1034,7 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed=feed_dict, fetch_list=[nce_loss2])[0] - with self.dynamic_graph(force_to_use_cpu=True): + with self.dynamic_graph(): words = [] for i in range(window_size): words.append(base.to_variable(inp_word[i])) @@ -1085,7 +1070,7 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) - with self.dynamic_graph(force_to_use_cpu=True): + with self.dynamic_graph(): custom_weight = np.random.randn(dict_size, 128).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -2011,13 +1996,13 @@ class TestLayer(LayerTest): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - x = np.random.rand(3, 32, 32).astype("float32") - y = np.array([[1], [0], [1]]) + # x = np.random.rand(3, 32, 32).astype("float32") + # y = np.array([[1], [0], [1]]) static_out = exe.run(feed={"input": x, "label": y}, fetch_list=result[0]) - with self.dynamic_graph(): + with self.dynamic_graph(force_to_use_cpu=True): data = base.to_variable(x) label = base.to_variable(y) fc_out = fluid.layers.fc(data, size=10) @@ -2678,13 +2663,6 @@ class TestBook(LayerTest): out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu') return (out) - def make_leaky_relu(self): - with program_guard(fluid.default_main_program(), - fluid.default_startup_program()): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.leaky_relu(input, alpha=0.1, name='leaky_relu') - return (out) - def make_soft_relu(self): with program_guard(fluid.default_main_program(), fluid.default_startup_program()): @@ -3704,5 +3682,32 @@ class TestBook(LayerTest): batch_first=batch_first) +class TestMetricsDetectionMap(unittest.TestCase): + def test_detection_map(self): + program = fluid.Program() + with program_guard(program): + detect_res = fluid.layers.data( + name='detect_res', + shape=[10, 6], + append_batch_size=False, + dtype='float32') + label = fluid.layers.data( + name='label', + shape=[10, 1], + append_batch_size=False, + dtype='float32') + box = fluid.layers.data( + name='bbox', + shape=[10, 4], + append_batch_size=False, + dtype='float32') + map_eval = fluid.metrics.DetectionMAP( + detect_res, label, box, class_num=21) + cur_map, accm_map = map_eval.get_map_var() + self.assertIsNotNone(cur_map) + self.assertIsNotNone(accm_map) + print(str(program)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index 71b452d4a2dd192c756599eb24949084bfa0860e..36368a83893c7eea3e5842638b3fc677e1a1b936 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ 
b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -19,6 +19,7 @@ import math import numpy as np import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.framework as framework @@ -522,111 +523,5 @@ class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase): run_places(lr, start_lr, end_lr) -def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss, - var_list): - def is_better(current, best, m, n): - if m == 'min' and n == 'rel': - return current < best - best * threshold - elif m == 'min' and n == 'abs': - return current < best - threshold - elif m == 'max' and n == 'rel': - return current > best + best * threshold - else: # mode == 'max' and epsilon_mode == 'abs': - return current > best + threshold - - if var_list[2] > 0: - var_list[2] -= 1 - return var_list[1] - - if is_better(loss, var_list[0], m, n): - var_list[0] = loss - var_list[3] = 0 - else: - var_list[3] += 1 - if var_list[3] > patience: - var_list[2] = cooldown - var_list[3] = 0 - new_lr = var_list[1] * decay_rate - var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1] - - return var_list[1] - - -class TestReduceLROnPlateauDecay(unittest.TestCase): - def test_dygraph_mode(self): - with fluid.dygraph.guard(): - # the decay rate must be less than 1.0 - with self.assertRaises(ValueError): - fluid.dygraph.ReduceLROnPlateau( - learning_rate=1.0, decay_rate=2.0) - # the mode must be "min" or "max" - with self.assertRaises(ValueError): - fluid.dygraph.ReduceLROnPlateau(learning_rate=1.0, mode="test") - # the threshold_mode must be "rel" or "abs" - with self.assertRaises(ValueError): - fluid.dygraph.ReduceLROnPlateau( - learning_rate=1.0, threshold_mode="test") - - base_lr = 1.0 - patience = 3 - cooldown = 1 - decay_rate = 0.5 - threshold = 1e-4 - linear = fluid.dygraph.Linear(10, 10) - - for m, n in zip(['min', 'max', 'min', 'max'], - ['rel', 'rel', 'abs', 'abs']): - kwargs = { - 'learning_rate': base_lr, - 'decay_rate': decay_rate, - 'threshold': threshold, - 'verbose': True, - 'patience': patience, - 'cooldown': cooldown, - 'mode': m, - 'threshold_mode': n, - 'eps': 1e-6 - } - print("class=" + fluid.dygraph.ReduceLROnPlateau.__name__ + - " kwargs=" + str(kwargs)) - lr = fluid.dygraph.ReduceLROnPlateau(**kwargs) - sgd = fluid.optimizer.SGD(learning_rate=lr, - parameter_list=linear.parameters()) - - best = float("-10000") if m == "max" else float("10000") - expected_lr = 1.0 - cooldown_counter = 0 - num_bad_epochs = 0 - var_list = [best, expected_lr, cooldown_counter, num_bad_epochs] - step_num = 0 - epoch_num = 0 - for epoch in range(30): - total_loss = 0 - - for batch_id in range(2): - step_num += 1 - x = fluid.dygraph.to_variable( - np.array([step_num]).astype('float32')) - loss = layers.sin(x) - sgd.minimize(loss) - total_loss += loss - - epoch_num += 1 - # get expected lr from fluid - avg_loss = total_loss / 1 - lr.step(avg_loss) - actual_lr = lr().numpy()[0] - - # get expected lr form python - expected_lr = reduce_lr_on_plateau(decay_rate, threshold, - cooldown, patience, m, n, - avg_loss, var_list) - self.assertEqual( - expected_lr, - actual_lr, - msg='Failed reduce lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'. 
- format(epoch_num, expected_lr, actual_lr)) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..9d07a80da15dbfd35ffdedbcb09e82d59a84486e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linear.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +import paddle +from paddle import fluid, nn +import paddle.fluid.dygraph as dg +import paddle.nn.functional as F +import paddle.fluid.initializer as I + + +class LinearTestCase(unittest.TestCase): + def setUp(self): + self.dtype = 'float32' + self.input = np.ones((3, 1, 2)).astype(self.dtype) + self.weight = np.ones((2, 2)).astype(self.dtype) + self.bias = np.ones((2)).astype(self.dtype) + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + + def functional(self, place): + paddle.disable_static(place) + input = paddle.to_tensor(self.input) + weight = paddle.to_tensor(self.weight) + bias = paddle.to_tensor(self.bias) + out = F.linear(input, weight, bias) + return out.numpy() + + def paddle_nn_layer(self, place): + paddle.disable_static(place) + input = paddle.to_tensor(self.input) + weight_attr = fluid.ParamAttr( + name="linear_weight", + learning_rate=1.0, + trainable=False, + regularizer=None, + initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0)) + bias_attr = fluid.ParamAttr( + name="linear_bias", + learning_rate=1.0, + trainable=False, + regularizer=None, + initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0)) + linear = paddle.nn.Linear( + 2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + y = linear(input) + return y.numpy() + + def numpy_cal(self): + res = np.matmul(self.input, self.weight) + self.bias + return res + + def test_error(self, place=paddle.CPUPlace()): + res_f = self.functional(place) + res_nn = self.paddle_nn_layer(place) + res_np = self.numpy_cal() + np.testing.assert_array_almost_equal(res_f, res_nn) + np.testing.assert_array_almost_equal(res_nn, res_np) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py index 98f7cd5b6b2dc8c82a71edf7ec36a24921726e3c..53e8b02081ae3acf8a7fb5dd2bc6e05cbc3be901 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import Program, program_guard -from paddle.nn.functional import * +from paddle.nn.functional import interpolate def linear_interp_np(input, diff --git 
a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py new file mode 100755 index 0000000000000000000000000000000000000000..04b56677fc158583fe79ec0dc1276210bd2ebbdc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py @@ -0,0 +1,438 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import platform +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +from paddle.nn.functional import interpolate + + +def linear_interp_np(input, + out_w, + out_size=None, + actual_shape=None, + align_corners=True, + align_mode=0, + data_layout='NCHW'): + if data_layout == "NHWC": + input = np.transpose(input, (0, 2, 1)) # NHWC => NCHW + if out_size is not None: + out_w = out_size[0] + if actual_shape is not None: + out_w = actual_shape[0] + batch_size, channel, in_w = input.shape + + ratio_w = 0.0 + if out_w > 1: + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((batch_size, channel, out_w)) + + for j in range(out_w): + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (j + 0.5) - 0.5) + else: + w = int(ratio_w * j) + w = max(0, w) + wid = 1 if w < in_w - 1 else 0 + + if (align_mode == 0 and not align_corners): + idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) + w1lambda = idx_src_w - w + else: + w1lambda = ratio_w * j - w + w2lambda = 1.0 - w1lambda + + out[:, :, j] = w2lambda * input[:, :, w] + w1lambda * input[:, :, w + + wid] + + if data_layout == "NHWC": + out = np.transpose(out, (0, 2, 1)) # NCHW => NHWC + + return out.astype(input.dtype) + + +class TestLinearInterpOp(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.op_type = "linear_interp_v2" + input_np = np.random.random(self.input_shape).astype("float64") + + if self.data_layout == "NCHW": + in_w = self.input_shape[2] + else: + in_w = self.input_shape[1] + + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = float(self.scale) + if isinstance(self.scale, list): + self.scale = float(self.scale[0]) + out_w = int(in_w * self.scale) + else: + out_w = self.out_w + + output_np = linear_interp_np(input_np, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode, self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + + self.attrs = { + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode, + 'data_layout': self.data_layout + } + if self.scale > 0: 
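+            # descriptive note: the 'scale' attribute is carried as a list of
+            # floats, so a scalar int/float self.scale is wrapped in a
+            # single-element list below before being recorded in self.attrs
+            # (out_w above was already derived from the scalar form).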
+ if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = [float(self.scale)] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + if platform.system() == "Linux": + self.check_output(atol=1e-7) + else: + self.check_output(atol=1e-5) + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'linear' + self.input_shape = [1, 3, 100] + self.out_w = 50 + self.scale = 0. + self.out_size = np.array([50, ]).astype("int32") + self.align_corners = False + self.align_mode = 1 + + +class TestLinearInterpOpDataLayout(TestLinearInterpOp): + def init_test_case(self): + self.interp_method = 'linear' + self.input_shape = [1, 3, 100] + self.out_w = 50 + self.scale = 0. + self.out_size = np.array([50, ]).astype("int32") + self.align_corners = False + self.align_mode = 1 + self.data_layout = 'NHWC' + + +class TestLinearInterpOpAlignMode(TestLinearInterpOp): + def init_test_case(self): + self.interp_method = 'linear' + self.input_shape = [1, 3, 100] + self.out_w = 50 + self.scale = 0. + self.out_size = np.array([50, ]).astype("int32") + self.align_corners = False + self.align_mode = 0 + + +class TestLinearInterpOpScale(TestLinearInterpOp): + def init_test_case(self): + self.interp_method = 'linear' + self.input_shape = [1, 3, 100] + self.out_w = 50 + self.scale = 0.5 + self.out_size = np.array([50, ]).astype("int32") + self.align_corners = False + self.align_mode = 0 + + +class TestLinearInterpOpSizeTensor(TestLinearInterpOp): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.op_type = "linear_interp_v2" + input_np = np.random.random(self.input_shape).astype("float64") + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + + if self.data_layout == "NCHW": + in_w = self.input_shape[2] + else: + in_w = self.input_shape[1] + + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = float(self.scale) + if isinstance(self.scale, list): + self.scale = float(self.scale[0]) + out_w = int(in_w * self.scale) + else: + out_w = self.out_w + + output_np = linear_interp_np(input_np, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode, self.data_layout) + + self.inputs = {'X': input_np} + if self.out_size is not None and self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.actual_shape is not None and self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.actual_shape + else: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs = { + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode, + 'data_layout': self.data_layout + } + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + +class TestResizeLinearAPI(unittest.TestCase): + def test_case(self): + x = fluid.data(name="x", shape=[1, 3, 64], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[1], 
dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[1], dtype="int32") + scale_tensor = fluid.data( + name="scale_tensor", shape=[1], dtype="float32") + + out1 = fluid.layers.resize_linear( + x, out_shape=[128, ], align_mode=1, align_corners=False) + out2 = fluid.layers.resize_linear( + x, out_shape=[128], align_mode=1, align_corners=False) + out3 = fluid.layers.resize_linear( + x, out_shape=shape_tensor, align_mode=1, align_corners=False) + out4 = fluid.layers.resize_linear( + x, + out_shape=[128, ], + actual_shape=actual_size, + align_mode=1, + align_corners=False) + out5 = fluid.layers.resize_linear( + x, scale=scale_tensor, align_mode=1, align_corners=False) + + out6 = interpolate( + x, + scale_factor=scale_tensor, + mode='linear', + align_mode=1, + align_corners=False, + data_format='NCW') + out7 = interpolate( + x, + size=[128, ], + mode='linear', + align_mode=1, + align_corners=False, + data_format='NCW') + out8 = interpolate( + x, + size=shape_tensor, + mode='linear', + align_mode=1, + align_corners=False, + data_format='NCW') + + x_data = np.random.random((1, 3, 64)).astype("float32") + dim_data = np.array([128]).astype("int32") + shape_data = np.array([128, ]).astype("int32") + actual_size_data = np.array([128, ]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run( + fluid.default_main_program(), + feed={ + "x": x_data, + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5, out6, out7, out8], + return_numpy=True) + + expect_res = linear_interp_np( + x_data, out_w=128, align_mode=1, align_corners=False) + for res in results: + self.assertTrue(np.allclose(res, expect_res)) + + +class TestLinearInterpOpAPI2_0(unittest.TestCase): + def test_case(self): + + # dygraph + x_data = np.random.random((1, 3, 128)).astype("float32") + us_1 = paddle.nn.UpSample( + size=[64, ], + mode='linear', + align_mode=1, + align_corners=False, + data_format='NCW') + with fluid.dygraph.guard(): + x = fluid.dygraph.to_variable(x_data) + interp = us_1(x) + + expect = linear_interp_np( + x_data, out_w=64, align_mode=1, align_corners=False) + + self.assertTrue(np.allclose(interp.numpy(), expect)) + + +class TestResizeLinearOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "linear_interp_v2" + input_np = np.random.random(self.input_shape).astype("uint8") + + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = float(self.scale) + if isinstance(self.scale, list): + self.scale = float(self.scale[0]) + out_w = int(self.input_shape[2] * self.scale) + else: + out_w = self.out_w + + output_np = linear_interp_np(input_np, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + + self.attrs = { + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode + } + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], 
self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + if platform.system() == "Linux": + self.check_output_with_place(place=core.CPUPlace(), atol=1e-7) + else: + self.check_output_with_place(place=core.CPUPlace(), atol=1e-5) + + def init_test_case(self): + self.interp_method = 'linear' + self.input_shape = [2, 3, 100] + self.out_w = 50 + self.scale = 0. + self.out_size = np.array([50, ]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestLinearInterpOpException(unittest.TestCase): + def test_exception(self): + def input_shape_error(): + x1 = fluid.data(name="x1", shape=[1], dtype="float32") + out = fluid.layers.resize_linear( + x1, out_shape=[256, ], data_format='NCW') + + def data_format_error(): + x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32") + out = fluid.layers.resize_linear( + x2, out_shape=[256, ], data_format='NHWCD') + + def out_shape_error(): + x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") + out = fluid.layers.resize_linear( + x3, out_shape=[ + 256, + 256, + ], data_format='NHWC') + + self.assertRaises(ValueError, input_shape_error) + self.assertRaises(ValueError, data_format_error) + self.assertRaises(ValueError, out_shape_error) + + +class TestLinearInterpOpError(unittest.TestCase): + def test_error(self): + with program_guard(Program(), Program()): + + def input_shape_error(): + x1 = fluid.data(name="x1", shape=[1], dtype="float32") + out1 = paddle.nn.UpSample( + size=[256, ], data_format='NCW', mode='linear') + out1_res = out1(x1) + + def data_format_error(): + x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32") + out2 = paddle.nn.UpSample( + size=[256, ], data_format='NHWCD', mode='linear') + out2_res = out2(x2) + + def out_shape_error(): + x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") + out3 = paddle.nn.UpSample( + size=[ + 256, + 256, + ], data_format='NHWC', mode='linear') + out3_res = out3(x3) + + self.assertRaises(ValueError, input_shape_error) + self.assertRaises(ValueError, data_format_error) + self.assertRaises(ValueError, out_shape_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py index 068993c4c1c5e770dd6cf7dc7a35b9ccc3f49aae..6d1f42111eebff0f469317ddf2a9ec7698a7ae1e 100644 --- a/python/paddle/fluid/tests/unittests/test_linspace.py +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -32,6 +32,7 @@ class TestLinspaceOpCommonCase(OpTest): 'Stop': np.array([10]).astype(dtype), 'Num': np.array([11]).astype('int32') } + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} self.outputs = {'Out': np.arange(0, 11).astype(dtype)} @@ -48,6 +49,7 @@ class TestLinspaceOpReverseCase(OpTest): 'Stop': np.array([0]).astype(dtype), 'Num': np.array([11]).astype('int32') } + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)} @@ -64,6 +66,7 @@ class TestLinspaceOpNumOneCase(OpTest): 'Stop': np.array([0]).astype(dtype), 'Num': np.array([1]).astype('int32') } + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} self.outputs = {'Out': np.array(10, dtype=dtype)} @@ -72,6 +75,26 @@ class TestLinspaceOpNumOneCase(OpTest): class TestLinspaceAPI(unittest.TestCase): + def test_variable_input1(self): + start = paddle.full(shape=[1], fill_value=0, dtype='float32') + stop = paddle.full(shape=[1], fill_value=10, dtype='float32') + num = 
paddle.full(shape=[1], fill_value=5, dtype='int32') + out = paddle.linspace(start, stop, num, dtype='float32') + exe = fluid.Executor(place=fluid.CPUPlace()) + res = exe.run(fluid.default_main_program(), fetch_list=[out]) + np_res = np.linspace(0, 10, 5, dtype='float32') + self.assertEqual((res == np_res).all(), True) + + def test_variable_input2(self): + paddle.disable_static() + start = paddle.full(shape=[1], fill_value=0, dtype='float32') + stop = paddle.full(shape=[1], fill_value=10, dtype='float32') + num = paddle.full(shape=[1], fill_value=5, dtype='int32') + out = paddle.linspace(start, stop, num, dtype='float32') + np_res = np.linspace(0, 10, 5, dtype='float32') + self.assertEqual((out.numpy() == np_res).all(), True) + paddle.enable_static() + def test_dtype(self): out_1 = paddle.linspace(0, 10, 5, dtype='float32') out_2 = paddle.linspace(0, 10, 5, dtype=np.float32) @@ -89,10 +112,16 @@ class TestLinspaceAPI(unittest.TestCase): def test_imperative(self): paddle.disable_static() - out = paddle.linspace(0, 10, 5, dtype='float32') - np_out = np.linspace(0, 10, 5, dtype='float32') + out1 = paddle.linspace(0, 10, 5, dtype='float32') + np_out1 = np.linspace(0, 10, 5, dtype='float32') + out2 = paddle.linspace(0, 10, 5, dtype='int32') + np_out2 = np.linspace(0, 10, 5, dtype='int32') + out3 = paddle.linspace(0, 10, 200, dtype='int32') + np_out3 = np.linspace(0, 10, 200, dtype='int32') paddle.enable_static() - self.assertEqual((out.numpy() == np_out).all(), True) + self.assertEqual((out1.numpy() == np_out1).all(), True) + self.assertEqual((out2.numpy() == np_out2).all(), True) + self.assertEqual((out3.numpy() == np_out3).all(), True) class TestLinspaceOpError(unittest.TestCase): @@ -100,7 +129,12 @@ class TestLinspaceOpError(unittest.TestCase): with program_guard(Program(), Program()): def test_dtype(): - fluid.layers.linspace(0, 10, 1, dtype="int32") + fluid.layers.linspace(0, 10, 1, dtype="int8") + + self.assertRaises(TypeError, test_dtype) + + def test_dtype(): + fluid.layers.linspace(0, 10, 1.33, dtype="int32") self.assertRaises(TypeError, test_dtype) @@ -120,20 +154,20 @@ class TestLinspaceOpError(unittest.TestCase): self.assertRaises(TypeError, test_step_dtype) def test_start_dtype(): - start = fluid.data(shape=[1], type="int32", name="start") + start = fluid.data(shape=[1], dtype="int32", name="start") fluid.layers.linspace(start, 10, 1, dtype="float32") self.assertRaises(TypeError, test_start_dtype) def test_end_dtype(): - end = fluid.data(shape=[1], type="int32", name="end") + end = fluid.data(shape=[1], dtype="int32", name="end") fluid.layers.linspace(0, end, 1, dtype="float32") self.assertRaises(TypeError, test_end_dtype) - def test_step_dtype(): - step = fluid.data(shape=[1], type="int32", name="step") - fluid.layers.linspace(0, 10, step, dtype="float32") + def test_num_dtype(): + num = fluid.data(shape=[1], dtype="int32", name="step") + fluid.layers.linspace(0, 10, num, dtype="float32") self.assertRaises(TypeError, test_step_dtype) diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py old mode 100644 new mode 100755 index 8f0049a8d30d0e1fed1d27cf6e13c036e33678d0..c8bb8c5b73f7680fc8a329656ef2b899f14d96ea --- a/python/paddle/fluid/tests/unittests/test_logical_op.py +++ b/python/paddle/fluid/tests/unittests/test_logical_op.py @@ -17,51 +17,235 @@ from __future__ import print_function import op_test import unittest import numpy as np +import paddle import paddle.fluid as fluid -from paddle.fluid import Program, 
program_guard +from paddle.static import Program, program_guard +TEST_META_OP_DATA = [{ + 'op_str': 'logical_and', + 'binary_op': True +}, { + 'op_str': 'logical_or', + 'binary_op': True +}, { + 'op_str': 'logical_xor', + 'binary_op': True +}, { + 'op_str': 'logical_not', + 'binary_op': False +}] -def create_test_class(op_type, callback, binary_op=True): - class Cls(op_test.OpTest): - def setUp(self): - a = np.random.choice(a=[True, False], size=(10, 7)).astype(bool) - if binary_op: - b = np.random.choice(a=[True, False], size=(10, 7)).astype(bool) - c = callback(a, b) - else: - c = callback(a) - self.outputs = {'Out': c} - self.op_type = op_type - if binary_op: - self.inputs = {'X': a, 'Y': b} +TEST_META_SHAPE_DATA = { + 'XDimLargerThanYDim1': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [4, 5] + }, + 'XDimLargerThanYDim2': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [4, 1] + }, + 'XDimLargerThanYDim3': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [1, 4, 1] + }, + 'XDimLargerThanYDim4': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [3, 4, 1] + }, + 'XDimLargerThanYDim5': { + 'x_shape': [2, 3, 1, 5], + 'y_shape': [3, 1, 1] + }, + 'XDimLessThanYDim1': { + 'x_shape': [4, 1], + 'y_shape': [2, 3, 4, 5] + }, + 'XDimLessThanYDim2': { + 'x_shape': [1, 4, 1], + 'y_shape': [2, 3, 4, 5] + }, + 'XDimLessThanYDim3': { + 'x_shape': [3, 4, 1], + 'y_shape': [2, 3, 4, 5] + }, + 'XDimLessThanYDim4': { + 'x_shape': [3, 1, 1], + 'y_shape': [2, 3, 1, 5] + }, + 'XDimLessThanYDim5': { + 'x_shape': [4, 5], + 'y_shape': [2, 3, 4, 5] + }, + 'Axis1InLargerDim': { + 'x_shape': [1, 4, 5], + 'y_shape': [2, 3, 1, 5] + }, + 'EqualDim1': { + 'x_shape': [10, 7], + 'y_shape': [10, 7] + }, + 'EqualDim2': { + 'x_shape': [1, 1, 4, 5], + 'y_shape': [2, 3, 1, 5] + } +} + +TEST_META_WRONG_SHAPE_DATA = { + 'ErrorDim1': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [3, 4] + }, + 'ErrorDim2': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [4, 3] + } +} + + +def run_static(x_np, y_np, op_str, use_gpu=False, binary_op=True): + paddle.enable_static() + startup_program = fluid.Program() + main_program = fluid.Program() + place = paddle.CPUPlace() + if use_gpu and fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + with fluid.program_guard(main_program, startup_program): + x = paddle.static.data(name='x', shape=x_np.shape, dtype='bool') + op = getattr(paddle, op_str) + feed_list = {'x': x_np} + if not binary_op: + res = op(x) + else: + y = paddle.static.data(name='y', shape=y_np.shape, dtype='bool') + feed_list['y'] = y_np + res = op(x, y) + exe.run(startup_program) + static_result = exe.run(main_program, feed=feed_list, fetch_list=[res]) + return static_result + + +def run_dygraph(x_np, y_np, op_str, use_gpu=False, binary_op=True): + place = paddle.CPUPlace() + if use_gpu and fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + paddle.disable_static(place) + op = getattr(paddle, op_str) + x = paddle.to_tensor(x_np) + if not binary_op: + dygraph_result = op(x) + else: + y = paddle.to_tensor(y_np) + dygraph_result = op(x, y) + return dygraph_result + + +def np_data_generator(np_shape, *args, **kwargs): + return np.random.choice(a=[True, False], size=np_shape).astype(bool) + + +def test(unit_test, use_gpu=False, test_error=False): + for op_data in TEST_META_OP_DATA: + meta_data = dict(op_data) + meta_data['use_gpu'] = use_gpu + np_op = getattr(np, meta_data['op_str']) + META_DATA = dict(TEST_META_SHAPE_DATA) + if test_error: + META_DATA = dict(TEST_META_WRONG_SHAPE_DATA) + for shape_data in 
META_DATA.values(): + meta_data['x_np'] = np_data_generator(shape_data['x_shape']) + meta_data['y_np'] = np_data_generator(shape_data['y_shape']) + if meta_data['binary_op'] and test_error: + # catch C++ Exception + unit_test.assertRaises(BaseException, run_static, **meta_data) + unit_test.assertRaises(BaseException, run_dygraph, **meta_data) + continue + static_result = run_static(**meta_data) + dygraph_result = run_dygraph(**meta_data) + if meta_data['binary_op']: + np_result = np_op(meta_data['x_np'], meta_data['y_np']) else: - self.inputs = {'X': a} - - def test_output(self): - self.check_output() - - def test_error(self): - with program_guard(Program(), Program()): - x = fluid.layers.data(name='x', shape=[2], dtype='bool') - y = fluid.layers.data(name='y', shape=[2], dtype='bool') - a = fluid.layers.data(name='a', shape=[2], dtype='int32') - op = eval("fluid.layers.%s" % self.op_type) - if self.op_type != "logical_not": - self.assertRaises(TypeError, op, x=x, y=y, out=1) - self.assertRaises(TypeError, op, x=x, y=a) - self.assertRaises(TypeError, op, x=a, y=y) - else: - self.assertRaises(TypeError, op, x=x, out=1) - self.assertRaises(TypeError, op, x=a) - - Cls.__name__ = op_type - globals()[op_type] = Cls - - -create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b)) -create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b)) -create_test_class('logical_not', lambda _a: np.logical_not(_a), False) -create_test_class('logical_xor', lambda _a, _b: np.logical_xor(_a, _b)) + np_result = np_op(meta_data['x_np']) + unit_test.assertTrue((static_result == np_result).all()) + unit_test.assertTrue((dygraph_result.numpy() == np_result).all()) + + +def test_type_error(unit_test, use_gpu, type_str_map): + def check_type(op_str, x, y, binary_op): + op = getattr(paddle, op_str) + error_type = TypeError + if isinstance(x, np.ndarray): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + error_type = BaseException + if binary_op: + if type_str_map['x'] != 'bool' or type_str_map['y'] != 'bool': + unit_test.assertRaises(error_type, op, x=x, y=y) + if not fluid.in_dygraph_mode(): + unit_test.assertRaises(error_type, op, x=x, y=y, out=1) + else: + if type_str_map['x'] != 'bool': + unit_test.assertRaises(error_type, op, x=x) + if not fluid.in_dygraph_mode(): + unit_test.assertRaises(error_type, op, x=x, out=1) + + place = paddle.CPUPlace() + if use_gpu and fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + for op_data in TEST_META_OP_DATA: + meta_data = dict(op_data) + binary_op = meta_data['binary_op'] + + paddle.disable_static(place) + x = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['x']) + y = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['y']) + check_type(meta_data['op_str'], x, y, binary_op) + + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data( + name='x', shape=[10], dtype=type_str_map['x']) + y = paddle.static.data( + name='y', shape=[10], dtype=type_str_map['y']) + check_type(meta_data['op_str'], x, y, binary_op) + + +def type_map_factory(): + x_type_list = ['float32', 'float64', 'int32', 'int64', 'bool'] + y_type_list = ['float32', 'float64', 'int32', 'int64', 'bool'] + return [{ + 'x': x_type, + 'y': y_type + } for x_type in x_type_list for y_type in y_type_list] + + +class TestCPU(unittest.TestCase): + def test(self): + test(self) + + def test_error(self): + test(self, False, 
True) + + def test_type_error(self): + type_map_list = type_map_factory() + for type_map in type_map_list: + test_type_error(self, False, type_map) + + +class TestCUDA(unittest.TestCase): + def test(self): + test(self, True) + + def test_error(self): + test(self, True, True) + + def test_type_error(self): + type_map_list = type_map_factory() + for type_map in type_map_list: + test_type_error(self, True, type_map) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index 508b4a7b72da8affbc7ddf590b8142a41d1f3191..c2201a52605bc87246fb9c8734494b19f83ff180 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -12,64 +12,128 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function import paddle -import paddle.fluid as fluid import unittest import numpy as np from op_test import OpTest -from paddle.fluid import Program, program_guard -from paddle.fluid.layer_helper import LayerHelper -class TestLogSumOpError(unittest.TestCase): +def ref_logsumexp(x, axis=None, keepdim=False, reduce_all=False): + if isinstance(axis, int): + axis = (axis, ) + elif isinstance(axis, list): + axis = tuple(axis) + if reduce_all: + axis = None + out = np.log(np.exp(x).sum(axis=axis, keepdims=keepdim)) + return out + + +class TestLogsumexp(OpTest): + def setUp(self): + self.op_type = 'logsumexp' + self.shape = [2, 3, 4, 5] + self.dtype = 'float64' + self.axis = [-1] + self.keepdim = False + self.reduce_all = False + self.set_attrs() + + np.random.seed(10) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = ref_logsumexp(x, self.axis, self.keepdim, self.reduce_all) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = { + 'dim': self.axis, + 'keep_dim': self.keepdim, + 'reduce_all': self.reduce_all + } + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], ['Out']) + + +class TestLogsumexp_shape(TestLogsumexp): + def set_attrs(self): + self.shape = [4, 5, 6] + + +class TestLogsumexp_axis(TestLogsumexp): + def set_attrs(self): + self.axis = [0, -1] + + +class TestLogsumexp_axis_all(TestLogsumexp): + def set_attrs(self): + self.axis = [0, 1, 2, 3] + + +class TestLogsumexp_keepdim(TestLogsumexp): + def set_attrs(self): + self.keepdim = True + + +class TestLogsumexp_reduce_all(TestLogsumexp): + def set_attrs(self): + self.reduce_all = True + + +class TestLogsumexpError(unittest.TestCase): def test_errors(self): - with program_guard(Program(), Program()): - - x1 = fluid.layers.data(name='x1', shape=[120], dtype="uint8") - self.assertRaises(Exception, paddle.logsumexp, x1) - - x2 = fluid.layers.data(name='x2', shape=[2, 3], dtype="int") - self.assertRaises(Exception, paddle.logsumexp, x2) - - x3 = fluid.layers.data(name='x3', shape=[3], dtype="float16") - self.assertRaises(Exception, paddle.logsumexp, x3) - - self.assertRaises(AssertionError, paddle.logsumexp, None) - - -class TestLogSumExpOp(unittest.TestCase): - def test_dygraph(self): - with fluid.dygraph.guard(): - np_x = np.random.uniform(0.1, 1, [123]).astype(np.float32) - x = fluid.dygraph.to_variable(np_x) - self.assertTrue( - np.allclose( - paddle.logsumexp(x).numpy(), np.log(np.sum(np.exp(np_x))))) - - np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32) - x = 
fluid.dygraph.to_variable(np_x) - self.assertTrue( - np.allclose( - paddle.logsumexp( - x, dim=[1, 2]).numpy(), - np.log(np.sum(np.exp(np_x), axis=(1, 2))))) - - np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32) - x = fluid.dygraph.to_variable(np_x) - self.assertTrue( - np.allclose( - paddle.logsumexp( - x, dim=[2]).numpy(), - np.log(np.sum(np.exp(np_x), axis=(2))))) - - np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32) - x = fluid.dygraph.to_variable(np_x) - self.assertTrue( - np.allclose( - paddle.logsumexp( - x, keepdim=True).numpy(), - np.log(np.sum(np.exp(np_x), keepdims=True)))) + with paddle.static.program_guard(paddle.static.Program()): + self.assertRaises(TypeError, paddle.logsumexp, 1) + x1 = paddle.data(name='x1', shape=[120], dtype="int32") + self.assertRaises(TypeError, paddle.logsumexp, x1) + + +class TestLogsumexpAPI(unittest.TestCase): + def setUp(self): + self.shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.shape).astype(np.float32) + self.place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def api_case(self, axis=None, keepdim=False): + out_ref = ref_logsumexp(self.x, axis, keepdim) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.shape) + out = paddle.logsumexp(x, axis, keepdim) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, fetch_list=[out]) + self.assertTrue(np.allclose(res[0], out_ref)) + + paddle.disable_static(self.place) + x = paddle.to_variable(self.x) + out = paddle.logsumexp(x, axis, keepdim) + self.assertTrue(np.allclose(out.numpy(), out_ref)) + paddle.enable_static() + + def test_api(self): + self.api_case() + self.api_case(2) + self.api_case([-1]) + self.api_case([2, -3]) + self.api_case((0, 1, -1)) + self.api_case(keepdim=True) + + def test_alias(self): + paddle.disable_static(self.place) + x = paddle.to_variable(self.x) + out1 = paddle.logsumexp(x) + out2 = paddle.tensor.logsumexp(x) + out3 = paddle.tensor.math.logsumexp(x) + out_ref = ref_logsumexp(self.x) + for out in [out1, out2, out3]: + self.assertTrue(np.allclose(out.numpy(), out_ref)) + paddle.enable_static() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..f655e363e964893a7ab4c0a966856f873800ff6c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -0,0 +1,519 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
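A note on the logsumexp oracle introduced above: ref_logsumexp exponentiates the raw inputs before summing, which is numerically safe in these tests only because x is drawn from [-1, 1]. The following standalone sketch (not part of the patch; stable_logsumexp is a hypothetical helper name) shows that this naive form agrees with the usual max-shift identity on such bounded inputs:

    import numpy as np

    # max-shift identity: log(sum(exp(x))) = m + log(sum(exp(x - m))), m = max(x)
    def stable_logsumexp(x, axis=-1):
        m = np.max(x, axis=axis, keepdims=True)
        return np.squeeze(m, axis=axis) + np.log(np.sum(np.exp(x - m), axis=axis))

    x = np.random.uniform(-1, 1, (2, 3, 4, 5)).astype('float64')
    naive = np.log(np.exp(x).sum(axis=-1))          # what ref_logsumexp computes
    assert np.allclose(naive, stable_logsumexp(x))  # the two forms agree here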
+ +from __future__ import print_function + +import copy +import math +import numpy as np +import unittest + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.framework as framework +import paddle.fluid.core as core + + +def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss, + var_list): + def is_better(current, best, m, n): + if m == 'min' and n == 'rel': + return current < best - best * threshold + elif m == 'min' and n == 'abs': + return current < best - threshold + elif m == 'max' and n == 'rel': + return current > best + best * threshold + else: # mode == 'max' and epsilon_mode == 'abs': + return current > best + threshold + + if var_list[2] > 0: + var_list[2] -= 1 + return var_list[1] + + if is_better(loss, var_list[0], m, n): + var_list[0] = loss + var_list[3] = 0 + else: + var_list[3] += 1 + if var_list[3] > patience: + var_list[2] = cooldown + var_list[3] = 0 + new_lr = var_list[1] * decay_rate + var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1] + + return var_list[1] + + +class TestReduceLROnPlateauDecay(object): + def test_ReduceLR(self): + # the decay rate must be less than 1.0 + with self.assertRaises(ValueError): + paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0) + # the mode must be "min" or "max" + with self.assertRaises(ValueError): + paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test") + # the threshold_mode must be "rel" or "abs" + with self.assertRaises(ValueError): + paddle.optimizer.ReduceLROnPlateau( + learning_rate=1.0, threshold_mode="test") + with self.assertRaises(TypeError): + paddle.optimizer.ReduceLROnPlateau(learning_rate="test") + with self.assertRaises(TypeError): + paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test") + + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + for place in places: + for m, n in zip(['min', 'max', 'min', 'max'], + ['rel', 'rel', 'abs', 'abs']): + kwargs = { + 'learning_rate': 1.0, + 'mode': m, + 'factor': 0.5, + 'patience': 3, + 'threshold': 1e-4, + 'threshold_mode': n, + 'cooldown': 1, + 'min_lr': 0, + 'epsilon': 1e-8, + 'verbose': False, + } + paddle.enable_static() + self._test_static(place, kwargs) + paddle.disable_static(place) + self._test_dygraph(place, kwargs) + paddle.enable_static() + + def _test_static(self, place, kwargs): + paddle.enable_static() + + best = float("-10000") if kwargs['mode'] == "max" else float("10000") + current_lr = 1.0 + cooldown_counter = 0 + num_bad_epochs = 0 + var_list = [best, current_lr, cooldown_counter, num_bad_epochs] + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = fluid.layers.create_global_var( + [1], 1, 'float32', persistable=True) + paddle.increment(x) + loss = paddle.sin(x) + scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + adam = paddle.optimizer.Adam(learning_rate=scheduler) + adam.minimize(loss) + lr_var = adam._global_learning_rate() + test_prog = main_prog.clone() + + exe = paddle.static.Executor(place) + exe.run(start_prog) + + for epoch in range(20): + for batch_id in range(1): + out, actual_lr = exe.run(main_prog, + fetch_list=[loss.name, lr_var.name]) + expected_lr = reduce_lr_on_plateau( + kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], + kwargs['patience'], kwargs['mode'], + kwargs['threshold_mode'], out[0], var_list) + + scheduler.step(out[0]) + actual_lr = 
scheduler() + self.assertEqual(actual_lr, np.array(expected_lr)) + + for epoch in range(10): + for batch_id in range(1): + out, actual_lr = exe.run(test_prog, + fetch_list=[loss.name, lr_var.name]) + expected_lr = reduce_lr_on_plateau( + kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], + kwargs['patience'], kwargs['mode'], + kwargs['threshold_mode'], out[0], var_list) + scheduler.step(out[0]) + actual_lr = scheduler() + self.assertEqual(actual_lr, np.array(expected_lr)) + + def _test_dygraph(self, place, kwargs): + paddle.disable_static(place) + + best = float("-10000") if kwargs['mode'] == "max" else float("10000") + current_lr = 1.0 + cooldown_counter = 0 + num_bad_epochs = 0 + var_list = [best, current_lr, cooldown_counter, num_bad_epochs] + + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, parameters=linear.parameters()) + + for epoch in range(20): + for batch_id in range(1): + x = paddle.to_tensor(epoch).astype('float32') + loss = paddle.sin(x) + loss.backward() + adam.step() + adam.clear_grad() + + scheduler.step(loss) + # get lr from paddle + current_lr = adam.get_lr() + # get lr form python + expected_lr = reduce_lr_on_plateau( + kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], + kwargs['patience'], kwargs['mode'], kwargs['threshold_mode'], + loss, var_list) + self.assertEqual(current_lr, expected_lr) + state_dict = adam.state_dict() + scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs) + adam1 = paddle.optimizer.Adam( + learning_rate=scheduler1, parameters=linear.parameters()) + adam1.set_state_dict(state_dict) + self.assertEqual(scheduler.cooldown_counter, + scheduler1.cooldown_counter) + self.assertEqual(scheduler.best.numpy()[0], scheduler1.best) + self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs) + self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch) + self.assertEqual(scheduler.last_lr, scheduler1.last_lr) + + +def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False): + if epoch_num == 0: + a = 1 + else: + a = math.pow(epoch_num, -0.5) + b = math.pow(warmup_steps, -1.5) * epoch_num + return learning_rate * math.pow(d_model, -0.5) * min(a, b) + + +def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False): + return learning_rate * lr_lambda(epoch_num) + + +def piecewise_lr(epoch_num, boundaries, values, verbose=False): + assert len(boundaries) + 1 == len(values) + for i in range(len(boundaries)): + if epoch_num < boundaries[i]: + return values[i] + return values[len(values) - 1] + + +def exponential_lr(epoch_num, learning_rate, gamma, verbose=False): + return learning_rate * gamma**epoch_num + + +def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False): + return learning_rate * math.exp(-1 * gamma * epoch_num) + + +def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False): + return learning_rate / (1 + gamma * epoch_num) + + +def polynomial_lr(epoch_num, + learning_rate, + decay_steps, + end_lr=0.0001, + power=1.0, + cycle=False, + verbose=False): + + if cycle: + div = math.ceil(epoch_num / float(decay_steps)) + if epoch_num == 0: + div = 1 + decay_steps = decay_steps * div + else: + epoch_num = min(epoch_num, decay_steps) + return (learning_rate - end_lr) * ( + (1 - float(epoch_num) / float(decay_steps))**power) + end_lr + + def get_lr(self): + if self.last_epoch == 0: + return self.base_lr + elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: + return 
self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos( + math.pi / self.T_max)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / ( + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * ( + self.last_lr - self.eta_min) + self.eta_min + + +cosine_annealing_lr_current = None + + +def cosine_annealing_lr(epoch_num, + learning_rate, + T_max, + eta_min=0, + verbose=False): + global cosine_annealing_lr_current + if epoch_num == 0: + cosine_annealing_lr_current = learning_rate + elif (epoch_num - 1 - T_max) % (2 * T_max) == 0: + cosine_annealing_lr_current = cosine_annealing_lr_current + ( + learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max)) + ) / 2 + else: + cosine_annealing_lr_current = (1 + math.cos( + math.pi * epoch_num / float(T_max))) / (1 + math.cos(math.pi * ( + epoch_num - 1) / float(T_max))) * (cosine_annealing_lr_current - + eta_min) + eta_min + return cosine_annealing_lr_current + + +def linear_warmup_lr(epoch_num, + learning_rate, + warmup_steps, + start_lr, + end_lr, + verbose=False): + if epoch_num < warmup_steps: + return start_lr + (end_lr - start_lr) * (float(epoch_num) / + float(warmup_steps)) + else: + return learning_rate + + +def multi_step_lr(epoch_num, + learning_rate, + milestones, + gamma=0.1, + verbose=False): + for i in range(len(milestones)): + if epoch_num < milestones[i]: + return learning_rate * (gamma**i) + return learning_rate * (gamma**len(milestones)) + + +def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False): + return learning_rate * math.pow(gamma, epoch_num // step_size) + + +class TestLRScheduler(unittest.TestCase): + def _test_static(self, python_func, paddle_api, kwarg, place): + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[3, 4, 5]) + y = paddle.static.data(name='y', shape=[3, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle_api(**kwarg) + adam = paddle.optimizer.Adam(learning_rate=scheduler) + adam.minimize(loss) + lr_var = adam._global_learning_rate() + test_prog = main_prog.clone() + + num = 0 + exe = paddle.static.Executor(place) + exe.run(start_prog) + for epoch in range(5): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + self.assertEqual(out, np.array(python_func(num, **kwarg))) + scheduler.step() + num += 1 + + for epoch in range(5): + for batch_id in range(2): + out = exe.run( + test_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + self.assertEqual(out, np.array(python_func(num, **kwarg))) + scheduler.step() + num += 1 + + if isinstance(place, paddle.CPUPlace): + compiled_train_prog = paddle.static.CompiledProgram( + main_prog).with_data_parallel( + loss_name=loss.name, places=fluid.cpu_places(4)) + for epoch in range(5): + python_result = python_func(num, **kwarg) + for batch_id in range(2): + _ = exe.run( + compiled_train_prog, + feed={ + 'x': np.random.randn(12, 4, 5).astype('float32'), + 'y': np.random.randn(12, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + scopes = compiled_train_prog._executor.local_scopes() + out = np.array(scopes[0].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = 
np.array(scopes[1].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[2].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[3].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + scheduler.step() + num += 1 + + compiled_test_prog = paddle.static.CompiledProgram( + test_prog).with_data_parallel( + loss_name=loss.name, + share_vars_from=compiled_train_prog, + places=fluid.cpu_places(4)) + for epoch in range(5): + python_result = python_func(num, **kwarg) + for batch_id in range(2): + _ = exe.run( + compiled_test_prog, + feed={ + 'x': np.random.randn(12, 4, 5).astype('float32'), + 'y': np.random.randn(12, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + scopes = compiled_test_prog._executor.local_scopes() + out = np.array(scopes[0].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[1].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[2].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[3].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + scheduler.step() + num += 1 + + def _test_dygraph(self, python_func, paddle_api, kwarg, place): + paddle.disable_static(place) + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle_api(**kwarg) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + adam.step() + adam.clear_grad() + current_lr = adam.get_lr() + expected_lr = python_func(epoch, **kwarg) + if paddle_api.__name__ != "CosineAnnealingLR": + self.assertEqual(current_lr, expected_lr) + scheduler.step() + else: + self.assertAlmostEqual(current_lr, expected_lr) + scheduler.step(epoch + 1) + + def test_scheduler(self): + with self.assertRaises(NotImplementedError): + paddle.optimizer.lr_scheduler._LRScheduler().step() + with self.assertRaises(TypeError): + paddle.optimizer.MultiStepLR( + learning_rate="test", milestones=[1, 2, 3]) + with self.assertRaises(TypeError): + paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test') + with self.assertRaises(ValueError): + paddle.optimizer.MultiStepLR( + learning_rate=0.5, milestones=[3, 2, 1]) + with self.assertRaises(ValueError): + paddle.optimizer.MultiStepLR( + learning_rate=0.5, milestones=[1, 2, 3], gamma=2) + + func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, { + "d_model": 0.01, + "warmup_steps": 100, + "verbose": False + }), (piecewise_lr, paddle.optimizer.PiecewiseLR, { + "boundaries": [3, 6, 9, 15, 20], + "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + "verbose": False + }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, { + "learning_rate": 0.5, + "gamma": 0.1, + "verbose": True + }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, { + "learning_rate": 0.5, + "gamma": 0.1, + "verbose": False + }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + "learning_rate": 0.5, + "decay_steps": 20, + "end_lr": 0, + "power": 1.0, + "cycle": False, + "verbose": True + }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + "learning_rate": 0.5, + "decay_steps": 20, + "end_lr": 0, + "power": 1.0, + "cycle": True, + "verbose": False + }), (linear_warmup_lr, 
paddle.optimizer.LinearLrWarmup, { + 'learning_rate': 0.5, + 'warmup_steps': 20, + 'start_lr': 0, + 'end_lr': 0.5, + "verbose": True + }), (exponential_lr, paddle.optimizer.ExponentialLR, { + "learning_rate": 0.5, + "gamma": 0.9, + "verbose": False + }), (multi_step_lr, paddle.optimizer.MultiStepLR, { + "learning_rate": 0.5, + "milestones": [3, 6, 9, 15, 20], + "gamma": 0.8, + "verbose": True + }), (step_lr, paddle.optimizer.StepLR, { + "learning_rate": 0.5, + "step_size": 2, + "gamma": 0.8, + "verbose": False + }), (lambda_lr, paddle.optimizer.LambdaLR, { + "learning_rate": 0.5, + "lr_lambda": lambda x: 0.95**x, + "verbose": True + }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, { + "learning_rate": 0.5, + "T_max": 10, + "verbose": False + })] + + for python_func, paddle_api, kwarg in func_api_kwargs: + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + for place in places: + paddle.enable_static() + #self._test_static(python_func, paddle_api, kwarg, place) + paddle.disable_static(place) + self._test_dygraph(python_func, paddle_api, kwarg, place) + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_manual_seed.py b/python/paddle/fluid/tests/unittests/test_manual_seed.py index 747026622e4653491feecb852dde67a19b8ff666..a1d6eb915ce78a2b19092f20f8cfeed0d60979b2 100644 --- a/python/paddle/fluid/tests/unittests/test_manual_seed.py +++ b/python/paddle/fluid/tests/unittests/test_manual_seed.py @@ -15,30 +15,33 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid from paddle.framework import manual_seed from paddle.fluid.framework import Program, default_main_program, default_startup_program +import numpy as np class TestManualSeed(unittest.TestCase): def test_manual_seed(self): - local_program = Program() - local_main_prog = default_main_program() - local_start_prog = default_startup_program() - - self.assertEqual(0, local_program.random_seed) - self.assertEqual(0, local_main_prog.random_seed) - self.assertEqual(0, local_start_prog.random_seed) - - manual_seed(102) - global_program1 = Program() - global_program2 = Program() - global_main_prog = default_main_program() - global_start_prog = default_startup_program() - self.assertEqual(102, global_program1.random_seed) - self.assertEqual(102, global_program2.random_seed) - self.assertEqual(102, global_main_prog.random_seed) - self.assertEqual(102, global_start_prog.random_seed) + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = fluid.layers.gaussian_random([10], dtype="float32") + st1 = gen.get_state() + x1 = fluid.layers.gaussian_random([10], dtype="float32") + gen.set_state(st1) + x2 = fluid.layers.gaussian_random([10], dtype="float32") + gen.manual_seed(12312321111) + x3 = fluid.layers.gaussian_random([10], dtype="float32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if not fluid.core.is_compiled_with_cuda(): + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_masked_select_op.py b/python/paddle/fluid/tests/unittests/test_masked_select_op.py new file mode 100644 index 0000000000000000000000000000000000000000..259a36e30d9a9c1852ff3800d5240ce7e7bb0e26 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_masked_select_op.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid as fluid +import paddle + + +def np_masked_select(x, mask): + result = np.empty(shape=(0), dtype=x.dtype) + for ele, ma in zip(np.nditer(x), np.nditer(mask)): + if ma: + result = np.append(result, ele) + return result.flatten() + + +class TestMaskedSelectOp(OpTest): + def setUp(self): + self.init() + self.op_type = "masked_select" + x = np.random.random(self.shape).astype("float64") + mask = np.array(np.random.randint(2, size=self.shape, dtype=bool)) + out = np_masked_select(x, mask) + self.inputs = {'X': x, 'Mask': mask} + self.outputs = {'Y': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y') + + def init(self): + self.shape = (50, 3) + + +class TestMaskedSelectOp1(TestMaskedSelectOp): + def init(self): + self.shape = (6, 8, 9, 18) + + +class TestMaskedSelectOp2(TestMaskedSelectOp): + def init(self): + self.shape = (168, ) + + +class TestMaskedSelectAPI(unittest.TestCase): + def test_imperative_mode(self): + paddle.disable_static() + shape = (88, 6, 8) + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + x = paddle.to_tensor(np_x) + mask = paddle.to_tensor(np_mask) + out = paddle.masked_select(x, mask) + np_out = np_masked_select(np_x, np_mask) + self.assertEqual(np.allclose(out.numpy(), np_out), True) + paddle.enable_static() + + def test_static_mode(self): + shape = [8, 9, 6] + x = paddle.data(shape=shape, dtype='float32', name='x') + mask = paddle.data(shape=shape, dtype='bool', name='mask') + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + + out = paddle.masked_select(x, mask) + np_out = np_masked_select(np_x, np_mask) + + exe = paddle.static.Executor(place=paddle.CPUPlace()) + + res = exe.run(paddle.static.default_main_program(), + feed={"x": np_x, + "mask": np_mask}, + fetch_list=[out]) + self.assertEqual(np.allclose(res, np_out), True) + + +class TestMaskedSelectError(unittest.TestCase): + def test_error(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + shape = [8, 9, 6] + x = paddle.data(shape=shape, dtype='float32', name='x') + mask = paddle.data(shape=shape, dtype='bool', name='mask') + mask_float = paddle.data( + shape=shape, dtype='float32', name='mask_float') + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + + def test_x_type(): + paddle.masked_select(np_x, mask) + + self.assertRaises(TypeError, test_x_type) + + def test_mask_type(): + paddle.masked_select(x, np_mask) + + self.assertRaises(TypeError, test_mask_type) + + def test_mask_dtype(): + paddle.masked_select(x, mask_float) + + self.assertRaises(TypeError, test_mask_dtype) + + +if 
__name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index f6eff22d6ce5f06d8853d6244f79b4b07b3fa4f5..00137f63e244a0e166047e89f9ef436da158ed16 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase): @prog_scope() def test_integer_div(self): a = fluid.layers.data(name="a", shape=[1], dtype='int64') - b = a / 7 + b = a / 2 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64') + a_np = numpy.array([3, 4, 10, 14, 9, 18]) b_np, = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - - b_np_actual = (a_np / 7).astype('int64') + # for paddle2.0, use true_divide + b_np_actual = (a_np / 2.0) self.assertTrue(numpy.array_equal(b_np, b_np_actual)) @prog_scope() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index 803293be9b7d637875b56b443b04c246737ed2f8..9bb12d546550a821e8a133dd9c91d5d41a50b1b2 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import numpy as np import six @@ -284,6 +285,223 @@ class TestMathOpPatchesVarBase(unittest.TestCase): self.assertEqual((a != b).dtype, fluid.core.VarDesc.VarType.BOOL) self.assertTrue(np.array_equal((a != b).numpy(), a_np != b_np)) + def test_tensor_patch_method(self): + paddle.disable_static() + x_np = np.random.uniform(-1, 1, [2, 3]).astype(self.dtype) + y_np = np.random.uniform(-1, 1, [2, 3]).astype(self.dtype) + z_np = np.random.uniform(-1, 1, [6, 9]).astype(self.dtype) + + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + z = paddle.to_tensor(z_np) + + a = paddle.to_tensor([[1, 1], [2, 2], [3, 3]]) + b = paddle.to_tensor([[1, 1], [2, 2], [3, 3]]) + + # 1. 
Unary operation for Tensor + self.assertEqual(x.dim(), 2) + self.assertEqual(x.ndimension(), 2) + self.assertEqual(x.ndim, 2) + self.assertEqual(x.size(), [2, 3]) + self.assertTrue( + np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy( + ))) + self.assertTrue( + np.array_equal(x.logsigmoid().numpy(), + fluid.layers.logsigmoid(x).numpy())) + self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy())) + self.assertTrue( + np.array_equal(x.tanh().numpy(), paddle.tanh(x).numpy())) + self.assertTrue( + np.array_equal(x.atan().numpy(), paddle.atan(x).numpy())) + self.assertTrue( + np.array_equal(x.tanh_shrink().numpy(), + fluid.layers.tanh_shrink(x).numpy())) + self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy())) + m = x.abs() + self.assertTrue( + np.array_equal(m.sqrt().numpy(), paddle.sqrt(m).numpy())) + self.assertTrue( + np.array_equal(m.rsqrt().numpy(), paddle.rsqrt(m).numpy())) + self.assertTrue( + np.array_equal(x.ceil().numpy(), paddle.ceil(x).numpy())) + self.assertTrue( + np.array_equal(x.floor().numpy(), paddle.floor(x).numpy())) + self.assertTrue(np.array_equal(x.cos().numpy(), paddle.cos(x).numpy())) + self.assertTrue( + np.array_equal(x.acos().numpy(), paddle.acos(x).numpy())) + self.assertTrue( + np.array_equal(x.asin().numpy(), paddle.asin(x).numpy())) + self.assertTrue(np.array_equal(x.sin().numpy(), paddle.sin(x).numpy())) + self.assertTrue( + np.array_equal(x.sinh().numpy(), paddle.sinh(x).numpy())) + self.assertTrue( + np.array_equal(x.cosh().numpy(), paddle.cosh(x).numpy())) + self.assertTrue( + np.array_equal(x.round().numpy(), paddle.round(x).numpy())) + self.assertTrue( + np.array_equal(x.reciprocal().numpy(), paddle.reciprocal(x).numpy( + ))) + self.assertTrue( + np.array_equal(x.square().numpy(), paddle.square(x).numpy())) + self.assertTrue( + np.array_equal(x.softplus().numpy(), + fluid.layers.softplus(x).numpy())) + self.assertTrue( + np.array_equal(x.softsign().numpy(), + fluid.layers.softsign(x).numpy())) + self.assertTrue( + np.array_equal(x.rank().numpy(), paddle.rank(x).numpy())) + self.assertTrue( + np.array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy())) + m = paddle.to_tensor(np.random.uniform(1, 2, [3, 3]), 'float32') + m = m.matmul(m.t()) + self.assertTrue( + np.array_equal(m.cholesky().numpy(), paddle.cholesky(m).numpy())) + + self.assertTrue( + np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy())) + self.assertTrue( + np.array_equal(x.isfinite().numpy(), paddle.isfinite(x).numpy())) + self.assertTrue( + np.array_equal( + x.cast('int32').numpy(), paddle.cast(x, 'int32').numpy())) + self.assertTrue( + np.array_equal( + x.expand([3, 2, 3]).numpy(), + paddle.expand(x, [3, 2, 3]).numpy())) + self.assertTrue( + np.array_equal( + x.tile([2, 2]).numpy(), paddle.tile(x, [2, 2]).numpy())) + self.assertTrue( + np.array_equal(x.flatten().numpy(), paddle.flatten(x).numpy())) + index = paddle.to_tensor([0, 1]) + self.assertTrue( + np.array_equal( + x.gather(index).numpy(), paddle.gather(x, index).numpy())) + index = paddle.to_tensor([[0, 1], [1, 2]]) + self.assertTrue( + np.array_equal( + x.gather_nd(index).numpy(), paddle.gather_nd(x, index).numpy())) + self.assertTrue( + np.array_equal( + x.reverse([0, 1]).numpy(), paddle.reverse(x, [0, 1]).numpy())) + self.assertTrue( + np.array_equal( + a.reshape([3, 2]).numpy(), paddle.reshape(a, [3, 2]).numpy())) + self.assertTrue( + np.array_equal( + x.slice([0, 1], [0, 0], [1, 2]).numpy(), + paddle.slice(x, [0, 1], [0, 0], [1, 2]).numpy())) + self.assertTrue( + 
np.array_equal( + x.split(2)[0].numpy(), paddle.split(x, 2)[0].numpy())) + m = paddle.to_tensor( + np.random.uniform(-1, 1, [1, 6, 1, 1]).astype(self.dtype)) + self.assertTrue( + np.array_equal( + m.squeeze([]).numpy(), paddle.squeeze(m, []).numpy())) + self.assertTrue( + np.array_equal( + m.squeeze([1, 2]).numpy(), paddle.squeeze(m, [1, 2]).numpy())) + m = paddle.to_tensor([2, 3, 3, 1, 5, 3], 'float32') + self.assertTrue( + np.array_equal(m.unique()[0].numpy(), paddle.unique(m)[0].numpy())) + self.assertTrue( + np.array_equal(m.unique_with_counts()[2], + paddle.unique_with_counts(m)[2])) + self.assertTrue(np.array_equal(x.flip([0]), paddle.flip(x, [0]))) + self.assertTrue(np.array_equal(x.unbind(0), paddle.unbind(x, 0))) + self.assertTrue(np.array_equal(x.roll(1), paddle.roll(x, 1))) + self.assertTrue(np.array_equal(x.cumsum(1), paddle.cumsum(x, 1))) + m = paddle.to_tensor(1) + self.assertTrue(np.array_equal(m.increment(), paddle.increment(m))) + m = x.abs() + self.assertTrue(np.array_equal(m.log(), paddle.log(m))) + self.assertTrue(np.array_equal(x.pow(2), paddle.pow(x, 2))) + self.assertTrue(np.array_equal(x.reciprocal(), paddle.reciprocal(x))) + + # 2. Binary operation + self.assertTrue( + np.array_equal( + x.matmul(y, True, False).numpy(), + paddle.matmul(x, y, True, False).numpy())) + self.assertTrue( + np.array_equal( + x.norm( + p='fro', axis=[0, 1]).numpy(), + paddle.norm( + x, p='fro', axis=[0, 1]).numpy())) + self.assertTrue( + np.array_equal(x.dist(y).numpy(), paddle.dist(x, y).numpy())) + self.assertTrue( + np.array_equal(x.cross(y).numpy(), paddle.cross(x, y).numpy())) + m = x.expand([2, 2, 3]) + n = y.expand([2, 2, 3]).transpose([0, 2, 1]) + self.assertTrue( + np.array_equal(m.bmm(n).numpy(), paddle.bmm(m, n).numpy())) + self.assertTrue( + np.array_equal( + x.histogram(5, -1, 1).numpy(), + paddle.histogram(x, 5, -1, 1).numpy())) + self.assertTrue( + np.array_equal(x.equal(y).numpy(), paddle.equal(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.greater_equal(y).numpy(), paddle.greater_equal(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.greater_than(y).numpy(), paddle.greater_than(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.less_equal(y).numpy(), paddle.less_equal(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.less_than(y).numpy(), paddle.less_than(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.not_equal(y).numpy(), paddle.not_equal(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.equal_all(y).numpy(), paddle.equal_all(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.allclose(y).numpy(), paddle.allclose(x, y).numpy())) + m = x.expand([2, 2, 3]) + self.assertTrue( + np.array_equal( + x.expand_as(m).numpy(), paddle.expand_as(x, m).numpy())) + index = paddle.to_tensor([2, 1, 0]) + self.assertTrue( + np.array_equal( + a.scatter(index, b).numpy(), + paddle.scatter(a, index, b).numpy())) + + # 3. 
Bool tensor operation + x = paddle.to_tensor([[True, False], [True, False]]) + y = paddle.to_tensor([[False, False], [False, True]]) + self.assertTrue( + np.array_equal(x.reduce_all().numpy(), paddle.reduce_all(x).numpy( + ))) + self.assertTrue( + np.array_equal(x.reduce_any().numpy(), paddle.reduce_any(x).numpy( + ))) + self.assertTrue( + np.array_equal( + x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.logical_not(y).numpy(), paddle.logical_not(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.logical_or(y).numpy(), paddle.logical_or(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.logical_xor(y).numpy(), paddle.logical_xor(x, y).numpy())) + self.assertTrue( + np.array_equal( + x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy())) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..884139a23d51c95c79439b91d501dc935baeae36 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -0,0 +1,336 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core + +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
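+        # e.g. np.matmul of two 1-D inputs returns a result with empty shape (), + # which the next line wraps into a shape (1, ) array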
+ Out = np.array([Out], dtype="float64") + return Out + + +class TestMatMulV2Op(OpTest): + """ + case 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + def setUp(self): + self.config() + self.op_type = "matmul_v2" + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + result = reference_matmul(x, y, self.trans_x, self.trans_y) + + self.inputs = { + 'X': x, + 'Y': y, + } + self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X', 'Y'], 'Out') + + +class TestMatMuklOp2(TestMatMulV2Op): + """ + case 2 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.trans_x = False + self.trans_y = True + self.dtype = "float64" + + +class TestMatMuklOp3(TestMatMulV2Op): + """ + case 3 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp4(TestMatMulV2Op): + """ + case 4 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 2, 100, 2) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp5(TestMatMulV2Op): + """ + case 5 + """ + + def config(self): + self.x_shape = (1, 1, 100, 2) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp6(TestMatMulV2Op): + """ + case 6 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp7(TestMatMulV2Op): + """ + case 7 + """ + + def config(self): + self.x_shape = (1, 2, 1, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp8(TestMatMulV2Op): + """ + case 8 + """ + + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp9(TestMatMulV2Op): + """ + case 9 + """ + + def config(self): + self.x_shape = (1, 1, 1, 100) + self.y_shape = (2, 1, 2, 100) + self.trans_x = False + self.trans_y = True + self.dtype = "float64" + + +class TestMatMuklOp10(TestMatMulV2Op): + """ + case 10 + """ + + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 2, 100, 2) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp11(TestMatMulV2Op): + """ + case 11 + """ + + def config(self): + self.x_shape = (2, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp12(TestMatMulV2Op): + """ + case 12 + """ + + def config(self): + self.x_shape = (2, 1, 100, 2) + self.y_shape = (1, 1, 100, 2) + self.trans_x = True + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp13(TestMatMulV2Op): + """ + case 13 + """ + + def config(self): + self.x_shape = (2, 2, 100, 2) + self.y_shape = (2, 2, 100, 2) + self.trans_x = True + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp14(TestMatMulV2Op): + """ + case 14_1 + """ + + def config(self): + self.x_shape = (3, 1, 1, 100, 2) + self.y_shape = (1, 2, 2, 100, 2) + self.trans_x = True + 
self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp15(TestMatMulV2Op): + """ + case 14_2 + """ + + def config(self): + self.x_shape = (3, 1, 1, 2, 100) + self.y_shape = (1, 2, 2, 100, 1) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp16(TestMatMulV2Op): + """ + case 16 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 1) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMuklOp17(TestMatMulV2Op): + """ + case 17 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.trans_x = False + self.trans_y = False + self.dtype = "float64" + + +class TestMatMulV2API(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32") + input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32") + + result = paddle.matmul(input_x, input_y) + + x_np = np.random.random([4, 3]).astype("float32") + y_np = np.random.random([3, 4]).astype("float32") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np, + "input_y": y_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float64") + input_y = np.random.random([3, 4]).astype("float64") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py index e2bdaba91a68ff17d8d17724f8cbd5d8ad684d08..c9afc4bec66f2927a674ac15e807fe01f724c64f 100644 --- a/python/paddle/fluid/tests/unittests/test_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_max_op.py @@ -32,7 +32,7 @@ class ApiMaxTest(unittest.TestCase): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="float32") + data = paddle.static.data("data", shape=[10, 10], dtype="float32") result_max = paddle.max(x=data, axis=1) exe = paddle.static.Executor(self.place) input_data = np.random.rand(10, 10).astype(np.float32) @@ -41,7 +41,7 @@ class ApiMaxTest(unittest.TestCase): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="int64") + data = paddle.static.data("data", shape=[10, 10], dtype="int64") result_max = paddle.max(x=data, axis=0) exe = paddle.static.Executor(self.place) input_data = np.random.randint(10, size=(10, 10)).astype(np.int64) @@ -50,7 +50,7 @@ class ApiMaxTest(unittest.TestCase): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="int64") + data = paddle.static.data("data", shape=[10, 10], dtype="int64") result_max = paddle.max(x=data, axis=(0, 1)) exe = paddle.static.Executor(self.place) input_data = np.random.randint(10, size=(10, 10)).astype(np.int64) @@ 
-71,8 +71,8 @@ class ApiMaxTest(unittest.TestCase): def test_axis_type(): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="int64") - axis = paddle.nn.data("axis", shape=[10, 10], dtype="int64") + data = paddle.static.data("data", shape=[10, 10], dtype="int64") + axis = paddle.static.data("axis", shape=[10, 10], dtype="int64") result_min = paddle.min(data, axis) self.assertRaises(TypeError, test_axis_type) diff --git a/python/paddle/fluid/tests/unittests/test_maximum_op.py b/python/paddle/fluid/tests/unittests/test_maximum_op.py index bed2b57ec596978548e745a9afb2d4225ae4e5a8..5645597007a00cac9c75ec1ae90bc00a5bc75f22 100644 --- a/python/paddle/fluid/tests/unittests/test_maximum_op.py +++ b/python/paddle/fluid/tests/unittests/test_maximum_op.py @@ -36,8 +36,8 @@ class ApiMaximumTest(unittest.TestCase): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data_x = paddle.nn.data("x", shape=[10, 15], dtype="float32") - data_y = paddle.nn.data("y", shape=[10, 15], dtype="float32") + data_x = paddle.static.data("x", shape=[10, 15], dtype="float32") + data_y = paddle.static.data("y", shape=[10, 15], dtype="float32") result_max = paddle.maximum(data_x, data_y) exe = paddle.static.Executor(self.place) res, = exe.run(feed={"x": self.input_x, @@ -48,8 +48,8 @@ class ApiMaximumTest(unittest.TestCase): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data_x = paddle.nn.data("x", shape=[10, 15], dtype="float32") - data_z = paddle.nn.data("z", shape=[15], dtype="float32") + data_x = paddle.static.data("x", shape=[10, 15], dtype="float32") + data_z = paddle.static.data("z", shape=[15], dtype="float32") result_max = paddle.maximum(data_x, data_z, axis=1) exe = paddle.static.Executor(self.place) res, = exe.run(feed={"x": self.input_x, diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index a2befb4a29a0f39d6d51c1a869c129889ec18015..29e79b096cf790858e8e07aedc5c6b76881e8f82 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -22,6 +22,8 @@ import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import Program, program_guard +np.random.seed(10) + class TestMeanOp(OpTest): def setUp(self): @@ -74,10 +76,105 @@ class TestFP16MeanOp(TestMeanOp): place, ['X'], 'Out', max_relative_error=0.8) +def ref_reduce_mean(x, axis=None, keepdim=False, reduce_all=False): + if isinstance(axis, list): + axis = tuple(axis) + if reduce_all: + axis = None + return np.mean(x, axis=axis, keepdims=keepdim) + + +class TestReduceMeanOp(OpTest): + def setUp(self): + self.op_type = 'reduce_mean' + self.dtype = 'float64' + self.shape = [2, 3, 4, 5] + self.axis = [0] + self.keepdim = False + self.reduce_all = False + self.set_attrs() + + np.random.seed(10) + x_np = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out_np = ref_reduce_mean(x_np, self.axis, self.keepdim, self.reduce_all) + self.inputs = {'X': x_np} + self.outputs = {'Out': out_np} + self.attrs = { + 'dim': self.axis, + 'keep_dim': self.keepdim, + 'reduce_all': self.reduce_all + } + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], ['Out']) + + +class TestReduceMeanOpDefaultAttrs(TestReduceMeanOp): + def setUp(self): + self.op_type = 
'reduce_mean' + self.dtype = 'float64' + self.shape = [2, 3, 4, 5] + + x_np = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out_np = np.mean(x_np, axis=0) + self.inputs = {'X': x_np} + self.outputs = {'Out': out_np} + + +class TestReduceMeanOpFloat32(TestReduceMeanOp): + def set_attrs(self): + self.dtype = 'float32' + + +class TestReduceMeanOpShape1D(TestReduceMeanOp): + def set_attrs(self): + self.shape = [100] + + +class TestReduceMeanOpShape6D(TestReduceMeanOp): + def set_attrs(self): + self.shape = [2, 3, 4, 5, 6, 7] + + +class TestReduceMeanOpAxisAll(TestReduceMeanOp): + def set_attrs(self): + self.axis = [0, 1, 2, 3] + + +class TestReduceMeanOpAxisTuple(TestReduceMeanOp): + def set_attrs(self): + self.axis = (0, 1, 2) + + +class TestReduceMeanOpAxisNegative(TestReduceMeanOp): + def set_attrs(self): + self.axis = [-2, -1] + + +class TestReduceMeanOpKeepdimTrue1(TestReduceMeanOp): + def set_attrs(self): + self.keepdim = True + + +class TestReduceMeanOpKeepdimTrue2(TestReduceMeanOp): + def set_attrs(self): + self.axis = [0, 1, 2, 3] + self.keepdim = True + + +class TestReduceMeanOpReduceAllTrue(TestReduceMeanOp): + def set_attrs(self): + self.reduce_all = True + + class TestMeanAPI(unittest.TestCase): - """ - test paddle.tensor.stat.mean - """ + # test paddle.tensor.stat.mean def setUp(self): self.x_shape = [2, 3, 4, 5] @@ -101,7 +198,7 @@ class TestMeanAPI(unittest.TestCase): fetch_list=[out1, out2, out3, out4, out5]) out_ref = np.mean(self.x) for out in res: - self.assertEqual(np.allclose(out, out_ref), True) + self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True) def test_api_dygraph(self): paddle.disable_static(self.place) @@ -114,7 +211,9 @@ class TestMeanAPI(unittest.TestCase): if len(axis) == 0: axis = None out_ref = np.mean(x, axis, keepdims=keepdim) - self.assertEqual(np.allclose(out.numpy(), out_ref), True) + self.assertEqual( + np.allclose( + out.numpy(), out_ref, rtol=1e-04), True) test_case(self.x) test_case(self.x, []) @@ -126,10 +225,31 @@ class TestMeanAPI(unittest.TestCase): test_case(self.x, [0, 1, 2, 3]) paddle.enable_static() + def test_fluid_api(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data("x", shape=[10, 10], dtype="float32") + out = fluid.layers.reduce_mean(input=x, dim=1) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + x_np = np.random.rand(10, 10).astype(np.float32) + res = exe.run(feed={"x": x_np}, fetch_list=[out]) + self.assertEqual(np.allclose(res[0], np.mean(x_np, axis=1)), True) + + with fluid.dygraph.guard(): + x_np = np.random.rand(10, 10).astype(np.float32) + x = fluid.dygraph.to_variable(x_np) + out = fluid.layers.reduce_mean(input=x, dim=1) + self.assertEqual(np.allclose(out.numpy(), np.mean(x_np, axis=1)), True) + def test_errors(self): + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 12]).astype('float32') + x = paddle.to_tensor(x) + self.assertRaises(Exception, paddle.mean, x, -3) + self.assertRaises(Exception, paddle.mean, x, 2) paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12], 'int8') + x = paddle.data('X', [10, 12], 'int32') self.assertRaises(TypeError, paddle.mean, x) diff --git a/python/paddle/fluid/tests/unittests/test_metrics.py b/python/paddle/fluid/tests/unittests/test_metrics.py deleted file mode 100644 index ec27884cae2b0462951f6597b1b83e58d1c8af5d..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_metrics.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 
(c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import paddle.fluid as fluid -from paddle.fluid.framework import Program, program_guard - - -class TestMetricsDetectionMap(unittest.TestCase): - def test_detection_map(self): - program = fluid.Program() - with program_guard(program): - detect_res = fluid.layers.data( - name='detect_res', - shape=[10, 6], - append_batch_size=False, - dtype='float32') - label = fluid.layers.data( - name='label', - shape=[10, 1], - append_batch_size=False, - dtype='float32') - box = fluid.layers.data( - name='bbox', - shape=[10, 4], - append_batch_size=False, - dtype='float32') - map_eval = fluid.metrics.DetectionMAP( - detect_res, label, box, class_num=21) - cur_map, accm_map = map_eval.get_map_var() - self.assertIsNotNone(cur_map) - self.assertIsNotNone(accm_map) - print(str(program)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py index e8bfe55f32a122ac9259b68d6a888f93757a76be..b9eff05c5ea9fb585421b6f99bf55b3bb95bf9ff 100644 --- a/python/paddle/fluid/tests/unittests/test_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_min_op.py @@ -32,7 +32,7 @@ class ApiMinTest(unittest.TestCase): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="float32") + data = paddle.static.data("data", shape=[10, 10], dtype="float32") result_min = paddle.min(x=data, axis=1) exe = paddle.static.Executor(self.place) input_data = np.random.rand(10, 10).astype(np.float32) @@ -41,7 +41,7 @@ class ApiMinTest(unittest.TestCase): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="int64") + data = paddle.static.data("data", shape=[10, 10], dtype="int64") result_min = paddle.min(x=data, axis=0) exe = paddle.static.Executor(self.place) input_data = np.random.randint(10, size=(10, 10)).astype(np.int64) @@ -50,7 +50,7 @@ class ApiMinTest(unittest.TestCase): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="int64") + data = paddle.static.data("data", shape=[10, 10], dtype="int64") result_min = paddle.min(x=data, axis=(0, 1)) exe = paddle.static.Executor(self.place) input_data = np.random.randint(10, size=(10, 10)).astype(np.int64) @@ -71,8 +71,8 @@ class ApiMinTest(unittest.TestCase): def test_axis_type(): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data = paddle.nn.data("data", shape=[10, 10], dtype="int64") - axis = paddle.nn.data("axis", shape=[10, 10], dtype="int64") + data = paddle.static.data("data", shape=[10, 10], dtype="int64") + axis = paddle.static.data("axis", shape=[10, 10], dtype="int64") result_min = paddle.min(data, axis) self.assertRaises(TypeError, 
test_axis_type) diff --git a/python/paddle/fluid/tests/unittests/test_minimum_op.py b/python/paddle/fluid/tests/unittests/test_minimum_op.py index 550580407acf265e0f0bfae12961466c2d57e48a..4c08b7386ca2c5da04c0a289872dacf68a2ea040 100644 --- a/python/paddle/fluid/tests/unittests/test_minimum_op.py +++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py @@ -36,8 +36,8 @@ class ApiMinimumTest(unittest.TestCase): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data_x = paddle.nn.data("x", shape=[10, 15], dtype="float32") - data_y = paddle.nn.data("y", shape=[10, 15], dtype="float32") + data_x = paddle.static.data("x", shape=[10, 15], dtype="float32") + data_y = paddle.static.data("y", shape=[10, 15], dtype="float32") result_min = paddle.minimum(data_x, data_y) exe = paddle.static.Executor(self.place) res, = exe.run(feed={"x": self.input_x, @@ -48,8 +48,8 @@ class ApiMinimumTest(unittest.TestCase): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - data_x = paddle.nn.data("x", shape=[10, 15], dtype="float32") - data_z = paddle.nn.data("z", shape=[15], dtype="float32") + data_x = paddle.static.data("x", shape=[10, 15], dtype="float32") + data_z = paddle.static.data("z", shape=[15], dtype="float32") result_min = paddle.minimum(data_x, data_z, axis=1) exe = paddle.static.Executor(self.place) res, = exe.run(feed={"x": self.input_x, diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 8ca06aa952184daec6be59a09330c8f16f6ee1d6..5f223de1954f7b401ac031265cca8c2e661c7392 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -175,5 +175,57 @@ class TestFP16MulOp2(TestMulOp2): no_grad_set=set('Y')) +@unittest.skipIf(not core.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp1(TestMulOp): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = core.XPUPlace(0) + self.check_output_with_place(place, atol=1e-1) + + def test_check_grad_normal(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +@unittest.skipIf(not core.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp2(TestMulOp2): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = core.XPUPlace(0) + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad_normal(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.9) + + def test_check_grad_ingore_x(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y')) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py old 
mode 100644 new mode 100755 index f7f6e1f1aac678a00617e0693847b4346604a1ab..dbf167617a24f36a36aff52d996a50ca3ebb6672 --- a/python/paddle/fluid/tests/unittests/test_multiply.py +++ b/python/paddle/fluid/tests/unittests/test_multiply.py @@ -26,8 +26,10 @@ class TestMultiplyAPI(unittest.TestCase): def __run_static_graph_case(self, x_data, y_data, axis=-1): with program_guard(Program(), Program()): - x = paddle.nn.data(name='x', shape=x_data.shape, dtype=x_data.dtype) - y = paddle.nn.data(name='y', shape=y_data.shape, dtype=y_data.dtype) + x = paddle.static.data( + name='x', shape=x_data.shape, dtype=x_data.dtype) + y = paddle.static.data( + name='y', shape=y_data.shape, dtype=y_data.dtype) res = tensor.multiply(x, y, axis=axis) place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( @@ -42,8 +44,8 @@ class TestMultiplyAPI(unittest.TestCase): def __run_dynamic_graph_case(self, x_data, y_data, axis=-1): paddle.disable_static() - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) res = paddle.multiply(x, y, axis=axis) return res.numpy() @@ -109,14 +111,14 @@ class TestMultiplyError(unittest.TestCase): # test static computation graph: dtype can not be int8 paddle.enable_static() with program_guard(Program(), Program()): - x = paddle.nn.data(name='x', shape=[100], dtype=np.int8) - y = paddle.nn.data(name='y', shape=[100], dtype=np.int8) + x = paddle.static.data(name='x', shape=[100], dtype=np.int8) + y = paddle.static.data(name='y', shape=[100], dtype=np.int8) self.assertRaises(TypeError, tensor.multiply, x, y) # test static computation graph: inputs must be broadcastable with program_guard(Program(), Program()): - x = paddle.nn.data(name='x', shape=[20, 50], dtype=np.float64) - y = paddle.nn.data(name='y', shape=[20], dtype=np.float64) + x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64) + y = paddle.static.data(name='y', shape=[20], dtype=np.float64) self.assertRaises(fluid.core.EnforceNotMet, tensor.multiply, x, y) np.random.seed(7) @@ -124,17 +126,31 @@ class TestMultiplyError(unittest.TestCase): paddle.disable_static() x_data = np.random.randn(200).astype(np.int8) y_data = np.random.randn(200).astype(np.int8) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) # test dynamic computation graph: inputs must be broadcastable x_data = np.random.rand(200, 5) y_data = np.random.rand(200) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) + # test dynamic computation graph: inputs must be broadcastable(python) + x_data = np.random.rand(200, 5) + y_data = np.random.rand(200) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) + + # test dynamic computation graph: dtype must be same + x_data = np.random.randn(200).astype(np.int64) + y_data = np.random.randn(200).astype(np.float64) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + self.assertRaises(TypeError, paddle.multiply, x, y) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py new file mode 100644 index 
0000000000000000000000000000000000000000..6e2f9562b453b7faf40d4fc421dcea4967724025 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.io import TensorDataset, DataLoader +from paddle.fluid.dygraph.base import to_variable + + +class TestTensorDataset(unittest.TestCase): + def run_main(self, num_workers, places): + fluid.default_startup_program().random_seed = 1 + fluid.default_main_program().random_seed = 1 + place = fluid.CPUPlace() + with fluid.dygraph.guard(place): + input_np = np.random.random([16, 3, 4]).astype('float32') + input = to_variable(input_np) + label_np = np.random.random([16, 1]).astype('int32') + label = to_variable(label_np) + + dataset = TensorDataset([input, label]) + assert len(dataset) == 16 + dataloader = DataLoader( + dataset, + places=place, + num_workers=num_workers, + batch_size=1, + drop_last=True) + + for i, (input, label) in enumerate(dataloader()): + assert len(input) == 1 + assert len(label) == 1 + assert input.shape == [1, 3, 4] + assert label.shape == [1, 1] + assert isinstance(input, paddle.Tensor) + assert isinstance(label, paddle.Tensor) + assert np.allclose(input.numpy(), input_np[i]) + assert np.allclose(label.numpy(), label_np[i]) + + def test_main(self): + for p in [fluid.CPUPlace(), fluid.CUDAPlace(0)]: + for num_workers in [0, 2]: + ret = self.run_main(num_workers=num_workers, places=p) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py new file mode 100755 index 0000000000000000000000000000000000000000..19da09a463f3cc6224a22eb90278abae9ec59b91 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py @@ -0,0 +1,556 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
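+ +# The NumPy reference in this file maps output pixel (i, j) back to input +# pixel (ratio_h * i, ratio_w * j): align_corners=True uses +# ratio = (in - 1) / (out - 1) and rounds the index, while align_corners=False +# uses ratio = in / out and truncates it.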
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.nn as nn +import paddle + + +def nearest_neighbor_interp_np(X, + out_h, + out_w, + out_size=None, + actual_shape=None, + align_corners=True, + data_layout='NCHW'): + """nearest neighbor interpolation implement in shape [N, C, H, W]""" + if data_layout == "NHWC": + X = np.transpose(X, (0, 3, 1, 2)) # NHWC => NCHW + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] + if actual_shape is not None: + out_h = actual_shape[0] + out_w = actual_shape[1] + n, c, in_h, in_w = X.shape + + ratio_h = ratio_w = 0.0 + if (out_h > 1): + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + if (out_w > 1): + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((n, c, out_h, out_w)) + + if align_corners: + for i in range(out_h): + in_i = int(ratio_h * i + 0.5) + for j in range(out_w): + in_j = int(ratio_w * j + 0.5) + out[:, :, i, j] = X[:, :, in_i, in_j] + else: + for i in range(out_h): + in_i = int(ratio_h * i) + for j in range(out_w): + in_j = int(ratio_w * j) + out[:, :, i, j] = X[:, :, in_i, in_j] + + if data_layout == "NHWC": + out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC + + return out.astype(X.dtype) + + +class TestNearestInterpOp(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.op_type = "nearest_interp_v2" + input_np = np.random.random(self.input_shape).astype("float64") + + if self.data_layout == "NCHW": + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_h = self.input_shape[1] + in_w = self.input_shape[2] + + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = nearest_neighbor_interp_np( + input_np, out_h, out_w, self.out_size, self.actual_shape, + self.align_corners, self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 4, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. 
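+        # a scale of 0. is falsy and treated as unset, so out_h/out_w apply; the + # explicit out_size on the next line then takes precedence in the expected output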
+ self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpCase1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.align_corners = True + + +class TestNearestNeighborInterpCase2(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.align_corners = True + + +class TestNearestNeighborInterpCase3(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = True + + +class TestNearestNeighborInterpCase4(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpCase5(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpCase6(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([65, 129]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpSame(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = 0. + self.align_corners = True + + +class TestNearestNeighborInterpActualShape(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 4, 4, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. 
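+        # data_layout is "NHWC" below, so setUp reads H and W from dims 1 and 2 of + # input_shape and the NumPy reference first transposes the input to NCHW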
+ self.out_size = np.array([3, 8]).astype("int32") + self.align_corners = True + self.data_layout = "NHWC" + + +class TestNearestInterpOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "nearest_interp_v2" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + self.out_size, self.actual_shape, + self.align_corners) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners + } + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(place=core.CPUPlace(), atol=1) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 3, 9, 6] + self.out_h = 10 + self.out_w = 9 + self.scale = 0. + self.align_corners = True + + +class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 32, 64] + self.out_h = 80 + self.out_w = 40 + self.scale = 0. + self.align_corners = True + + +class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.scale = 0. + self.out_size = np.array([6, 15]).astype("int32") + self.align_corners = True + + +class TestNearestInterpWithoutCorners(TestNearestInterpOp): + def set_align_corners(self): + self.align_corners = False + + +class TestNearestNeighborInterpScale1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = 2. 
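+        # with a positive scale, setUp derives out_h/out_w as in_h * scale and + # in_w * scale; the explicit out_size below still takes precedence in the + # expected output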
+ self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpScale2(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 5, 7] + self.out_h = 64 + self.out_w = 32 + self.scale = 1.5 + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpScale3(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 3.0] + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + + +class TestNearestInterpOp_attr_tensor(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "nearest_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + self.attrs = { + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + } + + input_np = np.random.random(self.input_shape).astype("float64") + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float64") + elif self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + self.out_size, self.actual_shape, + self.align_corners) + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 5, 4, 4] + self.out_h = 3 + self.out_w = 3 + self.scale = 0. + self.out_size = [3, 3] + self.align_corners = True + + +# out_size is a tensor list +class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = [8, 12] + self.align_corners = True + + +# out_size is a 1-D tensor +class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. 
+ self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + self.shape_by_1Dtensor = True + + +# scale is a 1-D tensor +class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 2.0 + self.out_size = None + self.align_corners = True + self.scale_by_1Dtensor = True + + +class TestNearestAPI(unittest.TestCase): + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data( + name="scale_tensor", shape=[1], dtype="float32") + + out1 = fluid.layers.resize_nearest( + y, out_shape=[12, 12], data_format='NHWC') + out2 = fluid.layers.resize_nearest(x, out_shape=[12, dim]) + out3 = fluid.layers.resize_nearest(x, out_shape=shape_tensor) + out4 = fluid.layers.resize_nearest( + x, out_shape=[4, 4], actual_shape=actual_size) + out5 = fluid.layers.resize_nearest(x, scale=scale_tensor) + + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "y": np.transpose(x_data, (0, 2, 3, 1)), + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = nearest_neighbor_interp_np( + x_data, out_h=12, out_w=12, align_corners=True) + self.assertTrue( + np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) + for i in range(len(results) - 1): + self.assertTrue(np.allclose(results[i + 1], expect_res)) + + +class TestUpsampleNearest2dInterpOpAPI2_0(unittest.TestCase): + def test_case(self): + + # dygraph + x_data = np.random.random((1, 3, 6, 6)).astype("float32") + upsample = paddle.nn.UpsamplingNearest2d(scale_factor=[2, 2]) + with fluid.dygraph.guard(): + x = fluid.dygraph.to_variable(x_data) + interp = upsample(x) + expect = nearest_neighbor_interp_np( + x_data, out_h=12, out_w=12, align_corners=False) + self.assertTrue(np.allclose(interp.numpy(), expect)) + + +class TestNearestInterpException(unittest.TestCase): + def test_exception(self): + input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32") + + def attr_data_format(): + # for 4-D input, data_format can only be NCHW or NHWC + out = fluid.layers.resize_nearest( + input, out_shape=[4, 8], data_format='NDHWC') + + def attr_scale_type(): + out = fluid.layers.resize_nearest(input, scale='scale') + + def attr_scale_value(): + out = fluid.layers.resize_nearest(input, scale=-0.3) + + self.assertRaises(ValueError, attr_data_format) + self.assertRaises(TypeError, attr_scale_type) + self.assertRaises(ValueError, attr_scale_value) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py 
b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py new file mode 100644 index 0000000000000000000000000000000000000000..339f689998f817054611bd85b11945b61d1f649b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py @@ -0,0 +1,207 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import math +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.nn.functional as functional +import paddle.fluid.framework as framework +from paddle.fluid.framework import Program, program_guard + + +class TestOneHotOp(OpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output(check_dygraph=False) + + +class TestOneHotOp_attr(OpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output(check_dygraph=False) + + +class TestOneHotOp_default_dtype(OpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output(check_dygraph=False) + + +class TestOneHotOp_default_dtype_attr(OpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + 
depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output(check_dygraph=False) + + +class TestOneHotOp_exception(unittest.TestCase): + def setUp(self): + self.op_type = 'one_hot_v2' + self.depth = 10 + self.place = core.CPUPlace() + self.dimension = 12 + self.x = core.LoDTensor() + x_lod = [[4, 1, 3, 3]] + data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))] + data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) + self.x.set(data, self.place) + self.x.set_recursive_sequence_lengths(x_lod) + + def test_check_output(self): + program = Program() + with program_guard(program): + x = fluid.layers.data( + name='x', shape=[self.dimension], dtype='float32', lod_level=1) + block = program.current_block() + one_hot_out = block.create_var( + name="one_hot_out", + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='float32') + block.append_op( + type='one_hot', + inputs={'X': x}, + attrs={'depth': self.depth}, + outputs={'Out': one_hot_out}) + exe = fluid.Executor(self.place) + + def run(): + exe.run(feed={'x': self.x}, + fetch_list=[one_hot_out], + return_numpy=False) + + self.assertRaises(core.EnforceNotMet, run) + + +class TestOneHotOpApi(unittest.TestCase): + def test_api(self): + num_classes = 10 + self._run(num_classes) + + def test_api_with_depthTensor(self): + num_classes = fluid.layers.assign(input=np.array([10], dtype=np.int32)) + self._run(num_classes) + + def test_api_with_dygraph(self): + num_classes = 10 + label = np.array( + [np.random.randint(0, num_classes - 1) + for i in range(6)]).reshape([6, 1]) + with fluid.dygraph.guard(): + one_hot_label = functional.one_hot( + x=fluid.dygraph.to_variable(label), num_classes=num_classes) + + def _run(self, num_classes): + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + one_hot_label = functional.one_hot(x=label, num_classes=num_classes) + + place = fluid.CPUPlace() + label_data = np.array([np.random.randint(0, 10 - 1) + for i in range(6)]).reshape([6, 1]) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + ret = exe.run(feed={'label': label_data, }, + fetch_list=[one_hot_label], + return_numpy=False) + + +class BadInputTestOnehotV2(unittest.TestCase): + def test_error(self): + with fluid.program_guard(fluid.Program()): + + def test_bad_x(): + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="float32") + one_hot_label = functional.one_hot(x=label, num_classes=4) + + self.assertRaises(TypeError, test_bad_x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py index 4f60f3e39a57365163cb3f5f3e061e53f8fd654b..0ebe769fb9bce1aee8412ccebc216c2c85e97775 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py +++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py @@ -54,9 +54,11 @@ def create_test_case(margin, reduction): margin=margin, reduction=reduction) with program_guard(Program(), Program()): - x = paddle.nn.data(name="x", shape=[10, 10], dtype="float64") - y = paddle.nn.data(name="y", shape=[10, 10], dtype="float64") - label = paddle.nn.data( + x = paddle.static.data( + name="x", shape=[10, 10], dtype="float64") + y = paddle.static.data( + name="y", shape=[10, 10], dtype="float64") + 
label = paddle.static.data( name="label", shape=[10, 10], dtype="float64") result = paddle.nn.functional.margin_ranking_loss( x, y, label, margin, reduction) @@ -78,9 +80,11 @@ def create_test_case(margin, reduction): margin=margin, reduction=reduction) with program_guard(Program(), Program()): - x = paddle.nn.data(name="x", shape=[10, 10], dtype="float64") - y = paddle.nn.data(name="y", shape=[10, 10], dtype="float64") - label = paddle.nn.data( + x = paddle.static.data( + name="x", shape=[10, 10], dtype="float64") + y = paddle.static.data( + name="y", shape=[10, 10], dtype="float64") + label = paddle.static.data( name="label", shape=[10, 10], dtype="float64") margin_rank_loss = paddle.nn.loss.MarginRankingLoss( margin=margin, reduction=reduction) @@ -173,6 +177,16 @@ class MarginRakingLossError(unittest.TestCase): self.assertRaises(ValueError, test_margin_value_error) + def test_functional_margin_value_error(): + x = paddle.static.data(name="x", shape=[10, 10], dtype="float64") + y = paddle.static.data(name="y", shape=[10, 10], dtype="float64") + label = paddle.static.data( + name="label", shape=[10, 10], dtype="float64") + result = paddle.nn.functional.margin_ranking_loss( + x, y, label, margin=0.1, reduction="reduction_mean") + + self.assertRaises(ValueError, test_functional_margin_value_error) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py index 451ab696fd6a245cd81219f818a182807e785f31..d52a1f5d5b16ca7e0d58230a1a17624e5bff0b02 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py @@ -45,7 +45,7 @@ class TestNNSigmoidAPI(unittest.TestCase): main_program = paddle.static.Program() mysigmoid = nn.Sigmoid(name="api_sigmoid") with paddle.static.program_guard(main_program): - x = paddle.nn.data(name='x', shape=self.x_shape) + x = paddle.static.data(name='x', shape=self.x_shape) x.stop_gradient = False y = mysigmoid(x) fluid.backward.append_backward(paddle.mean(y)) @@ -86,7 +86,7 @@ class TestNNFunctionalSigmoidAPI(unittest.TestCase): paddle.enable_static() main_program = paddle.static.Program() with paddle.static.program_guard(main_program): - x = paddle.nn.data(name='x', shape=self.x_shape) + x = paddle.static.data(name='x', shape=self.x_shape) y = functional.sigmoid(x, name="api_sigmoid") exe = paddle.static.Executor(fluid.CPUPlace()) out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y]) diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index e6b7a3e7603f53d78052d5de309d6ed7d84c4660..c047cf6ddff78641b918de75a284574175bb3bca 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -22,17 +22,48 @@ import paddle.fluid as fluid def p_norm(x, axis, porder, keepdims=False): - if axis is None: axis = -1 - xp = np.power(np.abs(x), porder) - s = np.sum(xp, axis=axis, keepdims=keepdims) - r = np.power(s, 1.0 / porder) + r = [] + if axis is None: + x = x.flatten() + if porder == np.inf: + r = np.amax(np.abs(x)) + elif porder == -np.inf: + r = np.amin(np.abs(x)) + else: + r = np.linalg.norm(x, ord=porder) + elif isinstance(axis, list or tuple) and len(axis) == 2: + if porder == np.inf: + axis = tuple(axis) + r = np.amax(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == -np.inf: + axis = tuple(axis) + r = np.amin(np.abs(x), axis=axis, 
keepdims=keepdims) + elif porder == 0: + axis = tuple(axis) + r = x.astype(bool) + r = np.sum(r, axis) + elif porder == 1: + axis = tuple(axis) + r = np.sum(np.abs(x), axis) + else: + axis = tuple(axis) + xp = np.power(np.abs(x), porder) + s = np.sum(xp, axis=axis, keepdims=keepdims) + r = np.power(s, 1.0 / porder) + else: + if isinstance(axis, list): + axis = tuple(axis) + r = np.linalg.norm( + x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype) + return r def frobenius_norm(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) if axis is None: axis = (-2, -1) - r = np.linalg.norm(x, ord='fro', axis=axis, keepdims=keepdims) + r = np.linalg.norm( + x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype) return r @@ -89,6 +120,7 @@ class TestPnormOp(OpTest): 'porder': float(self.porder) } self.outputs = {'Out': norm} + self.gradient = self.calc_gradient() def test_check_output(self): self.check_output() @@ -104,6 +136,34 @@ class TestPnormOp(OpTest): self.keepdim = False self.dtype = "float64" + def calc_gradient(self): + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder) + } + x = self.inputs["X"] + porder = self.attrs["porder"] + axis = self.attrs["axis"] + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + grad = np.power(norm, 1 - porder) * np.power( + np.abs(x), porder - 1) * np.sign(x) + + numel = 1 + for s in x.shape: + numel *= s + numel /= x.shape[axis] + return [grad.astype(x.dtype) * 1 / numel] + class TestPnormOp2(TestPnormOp): def init_test_case(self): @@ -118,22 +178,49 @@ class TestPnormOp2(TestPnormOp): self.check_grad(['X'], 'Out') -def run_out(self, p, axis, shape_x, shape_y, dtype): - with fluid.program_guard(fluid.Program()): - data1 = fluid.data(name="X", shape=shape_x, dtype=dtype) - data2 = fluid.data(name="Y", shape=shape_y, dtype=dtype) - out = paddle.norm(input=data1, p=p, axis=axis, out=data2) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - result = exe.run(feed={"X": np.random.rand(*shape_x).astype(dtype)}, - fetch_list=[data2, out]) - self.assertEqual((result[0] == result[1]).all(), True) +class TestPnormOp3(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = np.inf + self.keepdim = True + self.dtype = "float32" + + def test_check_grad(self): + self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) + + +class TestPnormOp4(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = -np.inf + self.keepdim = True + self.dtype = "float32" + + def test_check_grad(self): + self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) + + +class TestPnormOp5(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 0 + self.keepdim = True + self.dtype = "float32" + + def test_check_grad(self): + self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) def run_fro(self, p, axis, shape_x, dtype): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) - out = paddle.norm(input=data, p=p, axis=axis) + out = paddle.norm(x=data, p=p, axis=axis) place = 
fluid.CPUPlace() exe = fluid.Executor(place) np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) @@ -145,31 +232,72 @@ def run_fro(self, p, axis, shape_x, dtype): def run_pnorm(self, p, axis, shape_x, dtype): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) - out = paddle.norm(input=data, p=p, axis=axis) + out = paddle.norm(x=data, p=p, axis=axis) place = fluid.CPUPlace() exe = fluid.Executor(place) np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) expected_result = p_norm(np_input, porder=p, axis=axis).astype(dtype) result, = exe.run(feed={"X": np_input}, fetch_list=[out]) - self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + + +def run_graph(self, p, axis, shape_x, dtype): + paddle.disable_static() + shape = [2, 3, 4] + np_input = np.arange(24).astype('float32') - 12 + np_input = np_input.reshape(shape) + x = paddle.to_tensor(np_input) + #[[[-12. -11. -10. -9.] [ -8. -7. -6. -5.] [ -4. -3. -2. -1.]] + # [[ 0. 1. 2. 3.] [ 4. 5. 6. 7.] [ 8. 9. 10. 11.]]] + out_pnorm = paddle.norm(x, p=2, axis=-1) + + # compute frobenius norm along last two dimensions. + out_fro = paddle.norm(x, p='fro') + out_fro = paddle.norm(x, p='fro', axis=[0, 1]) + # compute 2-order norm along [0,1] dimension. + out_pnorm = paddle.norm(x, p=2, axis=[0, 1]) + out_pnorm = paddle.norm(x, p=2) + #out_pnorm = [17.43559577 16.91153453 16.73320053 16.91153453] + # compute inf-order norm + out_pnorm = paddle.norm(x, p=np.inf) + #out_pnorm = [12.] + out_pnorm = paddle.norm(x, p=np.inf, axis=0) + #out_pnorm = [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 1.]] + + # compute -inf-order norm + out_pnorm = paddle.norm(x, p=-np.inf) + #out_pnorm = [0.] 
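The reworked p_norm reference above special-cases p in {0, 1, +inf, -inf} and two-axis reductions. Here is a compact, self-contained NumPy cross-check of the same reference (illustrative sketch, not part of the patch); note it spells the type test as isinstance(axis, (list, tuple)), since isinstance(axis, list or tuple) only ever tests against list:

```python
import numpy as np

def p_norm_ref(x, axis=None, porder=2, keepdims=False):
    if axis is None:
        x = x.flatten()
        if porder == np.inf:
            return np.amax(np.abs(x))
        if porder == -np.inf:
            return np.amin(np.abs(x))
        return np.linalg.norm(x, ord=porder)
    if isinstance(axis, (list, tuple)) and len(axis) == 2:
        axis = tuple(axis)
        if porder == np.inf:
            return np.amax(np.abs(x), axis=axis, keepdims=keepdims)
        if porder == -np.inf:
            return np.amin(np.abs(x), axis=axis, keepdims=keepdims)
        if porder == 0:
            return np.sum(x.astype(bool), axis=axis, keepdims=keepdims)
        if porder == 1:
            return np.sum(np.abs(x), axis=axis, keepdims=keepdims)
        s = np.sum(np.power(np.abs(x), porder), axis=axis, keepdims=keepdims)
        return np.power(s, 1.0 / porder)
    return np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims)

x = np.arange(24, dtype=np.float64).reshape(2, 3, 4) - 12
# the two-axis 2-norm equals the explicit sqrt-of-sum-of-squares
assert np.allclose(p_norm_ref(x, axis=[1, 2], porder=2),
                   np.sqrt(np.sum(x ** 2, axis=(1, 2))))
```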
+ out_pnorm = paddle.norm(x, p=-np.inf, axis=0) + # out_fro = [17.43559577 16.91153453 16.73320053 16.91153453] + paddle.enable_static() class API_NormTest(unittest.TestCase): - def test_output_result(self): - run_out(self, p=2, axis=1, shape_x=[3, 4], shape_y=[3], dtype="float32") - run_out( - self, - p='fro', - axis=None, - shape_x=[3, 4], - shape_y=[1], - dtype="float32") - def test_basic(self): - run_fro(self, p='fro', axis=None, shape_x=[3, 3, 4], dtype="float32") - run_fro(self, p='fro', axis=[0, 1], shape_x=[3, 3, 4], dtype="float64") + run_fro(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32") + run_fro(self, p='fro', axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32") run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64") + run_pnorm(self, p=np.inf, axis=0, shape_x=[2, 3, 4], dtype="float32") + run_pnorm(self, p=np.inf, axis=None, shape_x=[2, 3, 4], dtype="float32") + run_pnorm(self, p=-np.inf, axis=0, shape_x=[2, 3, 4], dtype="float64") + run_pnorm( + self, p=-np.inf, axis=None, shape_x=[2, 3, 4], dtype="float64") + run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64") + + run_pnorm(self, p=1, axis=1, shape_x=[3, 4], dtype="float64") + run_pnorm(self, p=0, axis=None, shape_x=[3, 4], dtype="float64") + run_pnorm(self, p=2, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") + run_pnorm(self, p=2, axis=-1, shape_x=[2, 3, 4], dtype="float64") + run_pnorm(self, p=1, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") + run_pnorm(self, p=0, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") + run_pnorm( + self, p=np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") + run_pnorm( + self, p=-np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") + + def test_dygraph(self): + run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32") def test_name(self): with fluid.program_guard(fluid.Program()): @@ -197,11 +325,7 @@ class API_NormTest(unittest.TestCase): self.assertRaises(ValueError, paddle.norm, data, p="unsupport norm") self.assertRaises(ValueError, paddle.norm, data, p=[1]) self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1) - self.assertRaises( - ValueError, paddle.norm, data, p='unspport', axis=[-2, -1]) data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64") - self.assertRaises( - ValueError, paddle.norm, data, p='unspport', axis=[-2, -1]) self.assertRaises( ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1]) diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py new file mode 100644 index 0000000000000000000000000000000000000000..3e6855feaf491727203063f5c75c68301abbe05e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_normal.py @@ -0,0 +1,197 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
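The new test_normal.py that follows validates paddle.normal statistically rather than elementwise: each configuration is sampled repeat_num times and the empirical mean and standard deviation are compared to the requested ones with a loose tolerance. A minimal NumPy sketch of that acceptance style (illustrative only; numpy's sampler stands in for paddle.normal):

```python
import numpy as np

np.random.seed(10)
mean, std, shape, repeat_num = 1.0, 2.0, [8, 12], 2000

# draw repeat_num independent tensors, flatten each draw into one row
samples = np.stack(
    [np.random.normal(mean, std, shape) for _ in range(repeat_num)])
samples = samples.reshape([repeat_num, -1])

# per-element sample statistics should land close to the requested values
assert np.allclose(samples.mean(axis=0), mean, rtol=0.2, atol=0.2)
assert np.allclose(samples.std(axis=0), std, rtol=0.2, atol=0.2)
```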
+ +import unittest +import numpy as np +import paddle +import copy + +np.random.seed(10) + + +class TestNormalAPI(unittest.TestCase): + def setUp(self): + self.mean = 1.0 + self.std = 0.0 + self.shape = None + self.repeat_num = 2000 + self.set_attrs() + self.dtype = self.get_dtype() + self.place=paddle.CUDAPlace(0) \ + if paddle.fluid.core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def set_attrs(self): + self.shape = [8, 12] + + def get_shape(self): + if isinstance(self.mean, np.ndarray): + shape = self.mean.shape + elif isinstance(self.std, np.ndarray): + shape = self.std.shape + else: + shape = self.shape + return list(shape) + + def get_dtype(self): + if isinstance(self.mean, np.ndarray): + return self.mean.dtype + elif isinstance(self.std, np.ndarray): + return self.std.dtype + else: + return 'float32' + + def static_api(self): + shape = self.get_shape() + ret_all_shape = copy.deepcopy(shape) + ret_all_shape.insert(0, self.repeat_num) + ret_all = np.zeros(ret_all_shape, self.dtype) + if isinstance(self.mean, np.ndarray) \ + and isinstance(self.std, np.ndarray): + with paddle.static.program_guard(paddle.static.Program()): + mean = paddle.data('Mean', self.mean.shape, self.mean.dtype) + std = paddle.data('Std', self.std.shape, self.std.dtype) + out = paddle.normal(mean, std, self.shape) + + exe = paddle.static.Executor(self.place) + for i in range(self.repeat_num): + ret = exe.run(feed={ + 'Mean': self.mean, + 'Std': self.std.reshape(shape) + }, + fetch_list=[out]) + ret_all[i] = ret[0] + return ret_all + elif isinstance(self.mean, np.ndarray): + with paddle.static.program_guard(paddle.static.Program()): + mean = paddle.data('Mean', self.mean.shape, self.mean.dtype) + out = paddle.normal(mean, self.std, self.shape) + + exe = paddle.static.Executor(self.place) + for i in range(self.repeat_num): + ret = exe.run(feed={'Mean': self.mean}, fetch_list=[out]) + ret_all[i] = ret[0] + return ret_all + elif isinstance(self.std, np.ndarray): + with paddle.static.program_guard(paddle.static.Program()): + std = paddle.data('Std', self.std.shape, self.std.dtype) + out = paddle.normal(self.mean, std, self.shape) + + exe = paddle.static.Executor(self.place) + for i in range(self.repeat_num): + ret = exe.run(feed={'Std': self.std}, fetch_list=[out]) + ret_all[i] = ret[0] + return ret_all + else: + with paddle.static.program_guard(paddle.static.Program()): + out = paddle.normal(self.mean, self.std, self.shape) + + exe = paddle.static.Executor(self.place) + for i in range(self.repeat_num): + ret = exe.run(fetch_list=[out]) + ret_all[i] = ret[0] + return ret_all + + def dygraph_api(self): + paddle.disable_static(self.place) + shape = self.get_shape() + ret_all_shape = copy.deepcopy(shape) + ret_all_shape.insert(0, self.repeat_num) + ret_all = np.zeros(ret_all_shape, self.dtype) + + mean = paddle.to_tensor(self.mean) \ + if isinstance(self.mean, np.ndarray) else self.mean + std = paddle.to_tensor(self.std) \ + if isinstance(self.std, np.ndarray) else self.std + for i in range(self.repeat_num): + out = paddle.normal(mean, std, self.shape) + ret_all[i] = out.numpy() + paddle.enable_static() + return ret_all + + def test_api(self): + ret_static = self.static_api() + ret_dygraph = self.dygraph_api() + for ret in [ret_static, ret_dygraph]: + shape_ref = self.get_shape() + self.assertEqual(shape_ref, list(ret[0].shape)) + + ret = ret.flatten().reshape([self.repeat_num, -1]) + mean = np.mean(ret, axis=0) + std = np.std(ret, axis=0) + mean_ref=self.mean.reshape([1, -1]) \ + if isinstance(self.mean, 
np.ndarray) else self.mean + std_ref=self.std.reshape([1, -1]) \ + if isinstance(self.std, np.ndarray) else self.std + self.assertTrue(np.allclose(mean_ref, mean, 0.2, 0.2)) + self.assertTrue(np.allclose(std_ref, std, 0.2, 0.2)) + + +class TestNormalAPI_mean_is_tensor(TestNormalAPI): + def set_attrs(self): + self.mean = np.random.uniform(-2, -1, [2, 3, 4, 5]).astype('float64') + + +class TestNormalAPI_std_is_tensor(TestNormalAPI): + def set_attrs(self): + self.std = np.random.uniform(0.7, 1, [2, 3, 17]).astype('float64') + + +class TestNormalAPI_mean_std_are_tensor(TestNormalAPI): + def set_attrs(self): + self.mean = np.random.uniform(1, 2, [1, 100]).astype('float64') + self.std = np.random.uniform(0.5, 1, [1, 100]).astype('float64') + + +class TestNormalAPI_mean_std_are_tensor_with_different_dtype(TestNormalAPI): + def set_attrs(self): + self.mean = np.random.uniform(1, 2, [100]).astype('float64') + self.std = np.random.uniform(1, 2, [100]).astype('float32') + + +class TestNormalAlias(unittest.TestCase): + def test_alias(self): + paddle.disable_static() + shape = [1, 2, 3] + out1 = paddle.normal(shape=shape) + out2 = paddle.tensor.normal(shape=shape) + out3 = paddle.tensor.random.normal(shape=shape) + paddle.enable_static() + + +class TestNormalErrors(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + mean = [1, 2, 3] + self.assertRaises(TypeError, paddle.normal, mean) + + std = [1, 2, 3] + self.assertRaises(TypeError, paddle.normal, std=std) + + mean = paddle.data('Mean', [100], 'int32') + self.assertRaises(TypeError, paddle.normal, mean) + + std = paddle.data('Std', [100], 'int32') + self.assertRaises(TypeError, paddle.normal, mean=1.0, std=std) + + self.assertRaises(TypeError, paddle.normal, shape=1) + + self.assertRaises(TypeError, paddle.normal, shape=[1.0]) + + shape = paddle.data('Shape', [100], 'float32') + self.assertRaises(TypeError, paddle.normal, shape=shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_normalize.py b/python/paddle/fluid/tests/unittests/test_normalize.py index 6595a29b24ae23c9b38538035c9593ba77eecdb7..614e0e897613b235e2ec6fa72cfaf1057e7d5bbd 100644 --- a/python/paddle/fluid/tests/unittests/test_normalize.py +++ b/python/paddle/fluid/tests/unittests/test_normalize.py @@ -23,8 +23,6 @@ import numpy as np def p_normalize(x, axis=1, p=2, epsilon=1e-12, keepdims=True): - if len(x.shape) == 1: - axis = 0 xp = np.power(np.abs(x), p) s = np.sum(xp, axis=axis, keepdims=keepdims) r = np.maximum(np.power(s, 1.0 / p), epsilon) @@ -38,10 +36,10 @@ class TestNNFunctionalNormalize(unittest.TestCase): self.expected0 = p_normalize(self.input_np) self.expected1 = p_normalize(self.input_np, p=1.5) self.expected2 = p_normalize(self.input_np, axis=0) - self.expected3 = p_normalize(self.input_np2) + self.expected3 = p_normalize(self.input_np2, axis=0) def run_imperative(self): - x = paddle.to_variable(self.input_np) + x = paddle.to_tensor(self.input_np) y = F.normalize(x) self.assertTrue(np.allclose(y.numpy(), self.expected0)) @@ -51,10 +49,12 @@ class TestNNFunctionalNormalize(unittest.TestCase): y = F.normalize(x, axis=0) self.assertTrue(np.allclose(y.numpy(), self.expected2)) - x = paddle.to_variable(self.input_np2) - y = F.normalize(x) + x = paddle.to_tensor(self.input_np2) + y = F.normalize(x, axis=0) self.assertTrue(np.allclose(y.numpy(), self.expected3)) + self.assertRaises(BaseException, F.normalize, x) + def run_static(self, use_gpu=False): x = 
paddle.data(name='input', shape=[10, 10], dtype='float32') x2 = paddle.data(name='input2', shape=[2], dtype='float32') @@ -62,7 +62,7 @@ class TestNNFunctionalNormalize(unittest.TestCase): result1 = F.normalize(x, p=1.5) result2 = F.normalize(x, axis=0) result3 = F.normalize(x, name='aaa') - result4 = F.normalize(x2) + result4 = F.normalize(x2, axis=0) place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) @@ -77,6 +77,7 @@ class TestNNFunctionalNormalize(unittest.TestCase): self.assertTrue(np.allclose(static_result[2], self.expected2)) self.assertTrue('aaa' in result3.name) self.assertTrue(np.allclose(static_result[3], self.expected3)) + self.assertRaises(ValueError, F.normalize, x2) def test_cpu(self): paddle.disable_static(place=paddle.fluid.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8512bc99e7451c73e5513b834fb6aa448717c646 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_numel_op.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import functools +import paddle + + +class TestNumelOp(OpTest): + def setUp(self): + self.op_type = "size" + self.init() + x = np.random.random((self.shape)).astype("float64") + self.inputs = {'Input': x, } + self.outputs = {'Out': np.array([np.size(x)])} + + def test_check_output(self): + self.check_output() + + def init(self): + self.shape = (6, 56, 8, 55) + + +class TestNumelOp1(TestNumelOp): + def init(self): + self.shape = (11, 66) + + +class TestNumelOp2(TestNumelOp): + def init(self): + self.shape = (0, ) + + +class TestNumelOoAPI(unittest.TestCase): + def test_numel_static(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + shape1 = [2, 1, 4, 5] + shape2 = [1, 4, 5] + x_1 = paddle.data(shape=shape1, dtype='int32', name='x_1') + x_2 = paddle.data(shape=shape2, dtype='int32', name='x_2') + input_1 = np.random.random(shape1).astype("int32") + input_2 = np.random.random(shape2).astype("int32") + out_1 = paddle.numel(x_1) + out_2 = paddle.numel(x_2) + exe = paddle.static.Executor(place=paddle.CPUPlace()) + res_1, res_2 = exe.run(feed={ + "x_1": input_1, + "x_2": input_2, + }, + fetch_list=[out_1, out_2]) + assert (np.array_equal( + res_1, np.array([np.size(input_1)]).astype("int64"))) + assert (np.array_equal( + res_2, np.array([np.size(input_2)]).astype("int64"))) + + def test_numel_imperative(self): + paddle.disable_static(paddle.CPUPlace()) + input_1 = np.random.random([2, 1, 4, 5]).astype("int32") + input_2 = np.random.random([1, 4, 5]).astype("int32") + x_1 = 
paddle.to_variable(input_1) + x_2 = paddle.to_variable(input_2) + out_1 = paddle.numel(x_1) + out_2 = paddle.numel(x_2) + assert (np.array_equal(out_1.numpy().item(0), np.size(input_1))) + assert (np.array_equal(out_2.numpy().item(0), np.size(input_2))) + paddle.enable_static() + + def test_error(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + + def test_x_type(): + shape = [1, 4, 5] + input_1 = np.random.random(shape).astype("int32") + out_1 = paddle.numel(input_1) + + self.assertRaises(TypeError, test_x_type) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py new file mode 100644 index 0000000000000000000000000000000000000000..11719a9c4a92807375c1fdfcc7e168dccc5e522c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py @@ -0,0 +1,713 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.fluid.core as core + +from paddle.fluid import Program, program_guard, Executor, default_main_program + + +class TestPad3dOp(OpTest): + def setUp(self): + paddle.enable_static() + self.value = 0.0 + self.variable_paddings = False + self.initTestCase() + self.op_type = "pad3d" + self.inputs = {'X': np.random.random(self.shape).astype("float64")} + self.attrs = {} + if self.variable_paddings: + self.attrs['paddings'] = [] + self.inputs['Paddings'] = np.array(self.paddings).flatten().astype( + "int32") + else: + self.attrs['paddings'] = np.array(self.paddings).flatten().astype( + "int32") + self.attrs['value'] = self.value + self.attrs['mode'] = self.mode + self.attrs['data_format'] = self.data_format + if self.data_format == "NCDHW": + paddings = [ + (0, 0), + (0, 0), + (self.paddings[4], self.paddings[5]), + (self.paddings[2], self.paddings[3]), + (self.paddings[0], self.paddings[1]), + ] + else: + paddings = [ + (0, 0), + (self.paddings[4], self.paddings[5]), + (self.paddings[2], self.paddings[3]), + (self.paddings[0], self.paddings[1]), + (0, 0), + ] + if self.mode == "constant": + out = np.pad(self.inputs['X'], + paddings, + mode=self.mode, + constant_values=self.value) + elif self.mode == "reflect": + out = np.pad(self.inputs['X'], paddings, mode=self.mode) + elif self.mode == "replicate": + out = np.pad(self.inputs['X'], paddings, mode="edge") + elif self.mode == "circular": + out = np.pad(self.inputs['X'], paddings, mode="wrap") + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') + + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 0, 0, 0, 0, 0] + self.mode = "constant" + self.data_format = "NCDHW" + self.pad_value = 0.0 + + +class 
TestCase1(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 4, 5] + self.mode = "constant" + self.data_format = "NCDHW" + self.value = 1.0 + + +class TestCase2(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [1, 1, 1, 1, 1, 1] + self.mode = "constant" + self.data_format = "NDHWC" + self.value = 1.0 + + +class TestCase3(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 1, 0, 2, 3] + self.mode = "reflect" + self.data_format = "NCDHW" + + +class TestCase4(TestPad3dOp): + def initTestCase(self): + self.shape = (4, 4, 4, 4, 4) + self.paddings = [0, 1, 2, 1, 2, 3] + self.mode = "reflect" + self.data_format = "NDHWC" + + +class TestCase5(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 2, 1] + self.mode = "replicate" + self.data_format = "NCDHW" + + +class TestCase6(TestPad3dOp): + def initTestCase(self): + self.shape = (4, 4, 4, 4, 4) + self.paddings = [5, 4, 2, 1, 2, 3] + self.mode = "replicate" + self.data_format = "NDHWC" + + +class TestCase7(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 2, 1] + self.mode = "circular" + self.data_format = "NCDHW" + + +class TestCase8(TestPad3dOp): + def initTestCase(self): + self.shape = (4, 4, 4, 4, 4) + self.paddings = [0, 1, 2, 1, 2, 3] + self.mode = "circular" + self.data_format = "NDHWC" + + +class TestPadAPI(unittest.TestCase): + def setUp(self): + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def check_static_result_1(self, place): + paddle.enable_static() + with program_guard(Program(), Program()): + input_shape = (1, 2, 3, 4, 5) + pad = [1, 2, 1, 1, 3, 4] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.data(name="x", shape=input_shape) + result = F.pad(x=x, + pad=pad, + value=value, + mode=mode, + data_format="NCDHW") + exe = Executor(place) + fetches = exe.run(default_main_program(), + feed={"x": input_data}, + fetch_list=[result]) + + np_out = self._get_numpy_out(input_data, pad, mode, value) + self.assertTrue(np.allclose(fetches[0], np_out)) + + def check_static_result_2(self, place): + paddle.enable_static() + with program_guard(Program(), Program()): + input_shape = (2, 3, 4, 5, 6) + pad = [1, 2, 1, 1, 1, 2] + mode = "reflect" + input_data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.data(name="x", shape=input_shape) + result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") + result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") + exe = Executor(place) + fetches = exe.run(default_main_program(), + feed={"x": input_data}, + fetch_list=[result1, result2]) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, data_format="NCDHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, data_format="NDHWC") + self.assertTrue(np.allclose(fetches[0], np_out1)) + self.assertTrue(np.allclose(fetches[1], np_out2)) + + def check_static_result_3(self, place): + paddle.enable_static() + with program_guard(Program(), Program()): + input_shape = (2, 3, 4, 5, 6) + pad = [1, 2, 1, 1, 3, 4] + mode = "replicate" + input_data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.data(name="x", shape=input_shape) + result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") + result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") + exe = 
Executor(place) + fetches = exe.run(default_main_program(), + feed={"x": input_data}, + fetch_list=[result1, result2]) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, data_format="NCDHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, data_format="NDHWC") + self.assertTrue(np.allclose(fetches[0], np_out1)) + self.assertTrue(np.allclose(fetches[1], np_out2)) + + def check_static_result_4(self, place): + paddle.enable_static() + with program_guard(Program(), Program()): + input_shape = (2, 3, 4, 5, 6) + pad = [1, 2, 1, 1, 3, 4] + mode = "circular" + input_data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.data(name="x", shape=input_shape) + result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") + result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") + exe = Executor(place) + fetches = exe.run(default_main_program(), + feed={"x": input_data}, + fetch_list=[result1, result2]) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, data_format="NCDHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, data_format="NDHWC") + self.assertTrue(np.allclose(fetches[0], np_out1)) + self.assertTrue(np.allclose(fetches[1], np_out2)) + + def _get_numpy_out(self, + input_data, + pad, + mode, + value=0, + data_format="NCDHW"): + if data_format == "NCDHW": + pad = [ + (0, 0), + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + elif data_format == "NDHWC": + pad = [ + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + elif data_format == "NCHW": + pad = [ + (0, 0), + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + elif data_format == "NHWC": + pad = [ + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + elif data_format == "NCL": + pad = [ + (0, 0), + (0, 0), + (pad[0], pad[1]), + ] + elif data_format == "NLC": + pad = [ + (0, 0), + (pad[0], pad[1]), + (0, 0), + ] + + if mode == "constant": + out = np.pad(input_data, pad, mode=mode, constant_values=value) + elif mode == "reflect": + out = np.pad(input_data, pad, mode=mode) + elif mode == "replicate": + out = np.pad(input_data, pad, mode="edge") + elif mode == "circular": + out = np.pad(input_data, pad, mode="wrap") + + return out + + def test_static(self): + for place in self.places: + self.check_static_result_1(place=place) + self.check_static_result_2(place=place) + self.check_static_result_3(place=place) + self.check_static_result_4(place=place) + + def test_dygraph_1(self): + paddle.disable_static() + + input_shape = (1, 2, 3, 4, 5) + pad = [1, 2, 1, 1, 3, 4] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCDHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NDHWC") + tensor_data = paddle.to_tensor(input_data) + + y1 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NCDHW") + y2 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NDHWC") + + self.assertTrue(np.allclose(y1.numpy(), np_out1)) + self.assertTrue(np.allclose(y2.numpy(), np_out2)) + + def test_dygraph_2(self): + paddle.disable_static() + + input_shape = (2, 3, 4, 5) + pad = [1, 1, 3, 4] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NHWC") 
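The _get_numpy_out helper above builds reference outputs with numpy's padding modes: Paddle's 'replicate' maps to np.pad's 'edge', 'circular' maps to 'wrap', while 'constant' and 'reflect' carry over unchanged, and the flat pad list is regrouped into per-dimension (before, after) pairs. A tiny standalone check of that mapping (illustrative sketch, not part of the patch):

```python
import numpy as np

x = np.array([[1.0, 2.0, 3.0]])   # shape (1, 3); pad the last axis by 1 on each side
pad = [(0, 0), (1, 1)]

assert np.array_equal(np.pad(x, pad, mode="edge")[0], [1, 1, 2, 3, 3])      # replicate
assert np.array_equal(np.pad(x, pad, mode="wrap")[0], [3, 1, 2, 3, 1])      # circular
assert np.array_equal(np.pad(x, pad, mode="reflect")[0], [2, 1, 2, 3, 2])   # reflect
assert np.array_equal(np.pad(x, pad, mode="constant", constant_values=9)[0],
                      [9, 1, 2, 3, 9])                                      # constant
```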
+ + tensor_data = paddle.to_tensor(input_data) + tensor_pad = paddle.to_tensor(pad, dtype="int32") + + y1 = F.pad(tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NCHW") + y2 = F.pad(tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NHWC") + + self.assertTrue(np.allclose(y1.numpy(), np_out1)) + self.assertTrue(np.allclose(y2.numpy(), np_out2)) + + def test_dygraph_2(self): + paddle.disable_static() + + input_shape = (2, 3, 4, 5) + pad = [1, 1, 3, 4] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NHWC") + tensor_data = paddle.to_tensor(input_data) + tensor_pad = paddle.to_tensor(pad, dtype="int32") + + y1 = F.pad(tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NCHW") + y2 = F.pad(tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NHWC") + + self.assertTrue(np.allclose(y1.numpy(), np_out1)) + self.assertTrue(np.allclose(y2.numpy(), np_out2)) + + def test_dygraph_3(self): + paddle.disable_static() + + input_shape = (3, 4, 5) + pad = [3, 4] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCL") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NLC") + tensor_data = paddle.to_tensor(input_data) + tensor_pad = paddle.to_tensor(pad, dtype="int32") + + y1 = F.pad(tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NCL") + y2 = F.pad(tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NLC") + + self.assertTrue(np.allclose(y1.numpy(), np_out1)) + self.assertTrue(np.allclose(y2.numpy(), np_out2)) + + +class TestPad1dAPI(unittest.TestCase): + def _get_numpy_out(self, + input_data, + pad, + mode, + value=0.0, + data_format="NCL"): + if data_format == "NCL": + pad = [ + (0, 0), + (0, 0), + (pad[0], pad[1]), + ] + else: + pad = [ + (0, 0), + (pad[0], pad[1]), + (0, 0), + ] + + if mode == "constant": + out = np.pad(input_data, pad, mode=mode, constant_values=value) + elif mode == "reflect": + out = np.pad(input_data, pad, mode=mode) + elif mode == "replicate": + out = np.pad(input_data, pad, mode="edge") + + return out + + def setUp(self): + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_class(self): + paddle.disable_static() + for place in self.places: + input_shape = (3, 4, 5) + pad = [1, 2] + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_reflection = nn.ReflectionPad1d(padding=pad) + pad_replication = nn.ReplicationPad1d(padding=pad) + pad_constant = nn.ConstantPad1d(padding=pad, value=value) + + data = paddle.to_tensor(input_data) + + output = pad_reflection(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NCL") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_replication(data) + np_out = self._get_numpy_out( + input_data, pad, "replicate", data_format="NCL") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_constant(data) + np_out = self._get_numpy_out( + input_data, pad, "constant", value=value, data_format="NCL") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + +class TestPad2dAPI(unittest.TestCase): + 
def _get_numpy_out(self, + input_data, + pad, + mode, + value=0.0, + data_format="NCHW"): + if data_format == "NCHW": + pad = [ + (0, 0), + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + else: + pad = [ + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + + if mode == "constant": + out = np.pad(input_data, pad, mode=mode, constant_values=value) + elif mode == "reflect": + out = np.pad(input_data, pad, mode=mode) + elif mode == "replicate": + out = np.pad(input_data, pad, mode="edge") + + return out + + def setUp(self): + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_class(self): + paddle.disable_static() + for place in self.places: + input_shape = (3, 4, 5, 6) + pad = [1, 2, 2, 1] + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_reflection = nn.ReflectionPad2d(padding=pad) + pad_replication = nn.ReplicationPad2d(padding=pad) + pad_constant = nn.ConstantPad2d(padding=pad, value=value) + pad_zero = nn.ZeroPad2d(padding=pad) + + data = paddle.to_tensor(input_data) + + output = pad_reflection(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NCHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_replication(data) + np_out = self._get_numpy_out( + input_data, pad, "replicate", data_format="NCHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_constant(data) + np_out = self._get_numpy_out( + input_data, pad, "constant", value=value, data_format="NCHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_zero(data) + np_out = self._get_numpy_out( + input_data, pad, "constant", value=0, data_format="NCHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + +class TestPad3dAPI(unittest.TestCase): + def _get_numpy_out(self, + input_data, + pad, + mode, + value=0.0, + data_format="NCDHW"): + if data_format == "NCDHW": + pad = [ + (0, 0), + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + else: + pad = [ + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + + if mode == "constant": + out = np.pad(input_data, pad, mode=mode, constant_values=value) + elif mode == "reflect": + out = np.pad(input_data, pad, mode=mode) + elif mode == "replicate": + out = np.pad(input_data, pad, mode="edge") + + return out + + def setUp(self): + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_class(self): + paddle.disable_static() + for place in self.places: + input_shape = (3, 4, 5, 6, 7) + pad = [1, 2, 2, 1, 1, 0] + value = 100 + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_replication = nn.ReplicationPad3d(padding=pad) + pad_constant = nn.ConstantPad3d(padding=pad, value=value) + + data = paddle.to_tensor(input_data) + + output = pad_replication(data) + np_out = self._get_numpy_out( + input_data, pad, "replicate", data_format="NCDHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_constant(data) + np_out = self._get_numpy_out( + input_data, pad, "constant", value=value, data_format="NCDHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + +class TestPad3dOpError(unittest.TestCase): + def test_errors(self): + def test_variable(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + F.pad(x=data, paddings=[1, 1, 1, 1, 1, 1]) + + def test_reflect_1(): + 
input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.data(name="x", shape=input_shape) + y = F.pad(x, pad=[5, 6, 1, 1, 1, 1], value=1, mode='reflect') + place = paddle.CPUPlace() + exe = Executor(place) + outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + + def test_reflect_2(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.data(name="x", shape=input_shape) + y = F.pad(x, pad=[1, 1, 4, 3, 1, 1], value=1, mode='reflect') + place = paddle.CPUPlace() + exe = Executor(place) + outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + + def test_reflect_3(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.data(name="x", shape=input_shape) + y = F.pad(x, pad=[1, 1, 1, 1, 2, 3], value=1, mode='reflect') + place = paddle.CPUPlace() + exe = Executor(place) + outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + + self.assertRaises(TypeError, test_variable) + + self.assertRaises(Exception, test_reflect_1) + + self.assertRaises(Exception, test_reflect_2) + + self.assertRaises(Exception, test_reflect_3) + + +class TestPadDataformatError(unittest.TestCase): + def test_errors(self): + def test_ncl(): + paddle.disable_static(paddle.CPUPlace()) + input_shape = (1, 2, 3, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = np.arange( + np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 + my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCL") + data = paddle.to_tensor(data) + result = my_pad(data) + + def test_nchw(): + paddle.disable_static(paddle.CPUPlace()) + input_shape = (1, 2, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = np.arange( + np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 + my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCHW") + data = paddle.to_tensor(data) + result = my_pad(data) + + def test_ncdhw(): + paddle.disable_static(paddle.CPUPlace()) + input_shape = (1, 2, 3, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = np.arange( + np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 + my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCDHW") + data = paddle.to_tensor(data) + result = my_pad(data) + + self.assertRaises(AssertionError, test_ncl) + + self.assertRaises(AssertionError, test_nchw) + + self.assertRaises(AssertionError, test_ncdhw) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 858d56c1fc04f61c9dd281a633f7be9aceff8338..2ffe523ef6dda18a24813e702a1892c335ba6a68 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -52,8 +52,6 @@ class TestDygraphDoubleGrad(TestCase): retain_graph=None, create_graph=False, allow_unused=False): - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = self.sort_sum_gradient return paddle.grad( outputs=outputs, inputs=inputs, @@ -61,8 +59,7 @@ class TestDygraphDoubleGrad(TestCase): no_grad_vars=no_grad_vars, retain_graph=retain_graph, create_graph=create_graph, - allow_unused=allow_unused, - backward_strategy=backward_strategy) + allow_unused=allow_unused) @dygraph_guard def test_exception(self): diff --git 
a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index 5677157fde8d718cbeb6691849421efbbb47dc10..bac196b1ab52b604a85321a5473d455d2616bf0d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -13,11 +13,16 @@ # limitations under the License. from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_mnist import TestMnist -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,11 @@ class TestParallelDygraphMnist(TestDistBase): log_name=flag_name) +class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): + def test_mnist_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py index 8c5cdf8321a4bdff8981b3b0dafed66d067a41e3..cf89dc484c4880092962eb0322b98bac1d15dcd3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py @@ -13,11 +13,16 @@ # limitations under the License. from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_se_resnext import TestSeResNeXt -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,12 @@ class TestParallelDygraphSeResNeXt(TestDistBase): log_name=flag_name) +class TestParallelDygraphSeResNeXtSpawn(TestDistSpawnRunner): + def test_se_resnext_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSeResNeXt, delta=0.01) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py index 40b5833053d29bb36c5f15aca23aadb761597838..7f051f1005c7b7591bd813e1c4677e383c439ed0 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py @@ -15,10 +15,13 @@ from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding import TestSparseEmbedding flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ class TestParallelDygraphSparseEmdedding(TestDistBase): log_name=flag_name) +class TestParallelDygraphSparseEmdeddingSpawn(TestDistSpawnRunner): + def test_sparse_embedding_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSparseEmbedding, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff 
--git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py index 5c34b35fc83a3de6e1a33a51dad1e4e264afd52b..7cf1e9711b74b31e15b732f87addbc9fa653152f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -25,7 +25,7 @@ class TestParallelDygraphMnist(TestDistBase): def _setup_config(self): self._sync_mode = False self._nccl2_mode = True - self._dygraph = False #True + self._dygraph = True def test_mnist(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py index 385c4d892a650bc18400f4f59cd2ed10bea24d70..c8d47eab2c5191e941d188845a927b5202af9738 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py @@ -15,10 +15,13 @@ from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_transformer import TestTransformer flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ class TestParallelDygraphTransformer(TestDistBase): log_name=flag_name) +class TestParallelDygraphTransformerSpawn(TestDistSpawnRunner): + def test_transformer_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestTransformer, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index 0bcb4be3b7fb9380932cf137ac8e4939dcd77288..cf93f39ab8c5c92aa075f2f0a7dca9a5c5d9f485 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -16,16 +16,17 @@ from __future__ import print_function import unittest import numpy as np + from op_test import OpTest +import paddle +import paddle.nn.functional as F +import paddle.fluid.core as core +import paddle.fluid as fluid -class TestPixelShuffle(OpTest): - def setUp(self): - self.op_type = "pixel_shuffle" - n, c, h, w = 2, 9, 4, 4 - up_factor = 3 - shape = [n, c, h, w] - x = np.random.random(shape).astype("float64") +def pixel_shuffle_np(x, up_factor, data_format="NCHW"): + if data_format == "NCHW": + n, c, h, w = x.shape new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h, w) # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w) @@ -34,10 +35,42 @@ class TestPixelShuffle(OpTest): npresult = npresult.transpose(0, 1, 4, 2, 5, 3) oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor] npresult = np.reshape(npresult, oshape) + return npresult + else: + n, h, w, c = x.shape + new_shape = (n, h, w, c // (up_factor * up_factor), up_factor, + up_factor) + # reshape to (num,h,w,output_channel,upscale_factor,upscale_factor) + npresult = np.reshape(x, new_shape) + # transpose to (num,h,upscale_factor,w,upscale_factor,output_channel) + npresult = npresult.transpose(0, 1, 4, 2, 5, 3) + oshape = [n, h * up_factor, w * up_factor, c // (up_factor * up_factor)] + npresult = np.reshape(npresult, 
oshape) + return npresult + + +class TestPixelShuffleOp(OpTest): + def setUp(self): + self.op_type = "pixel_shuffle" + self.init_data_format() + n, c, h, w = 2, 9, 4, 4 + + if self.format == "NCHW": + shape = [n, c, h, w] + if self.format == "NHWC": + shape = [n, h, w, c] + + up_factor = 3 + + x = np.random.random(shape).astype("float64") + npresult = pixel_shuffle_np(x, up_factor, self.format) self.inputs = {'X': x} self.outputs = {'Out': npresult} - self.attrs = {'upscale_factor': up_factor} + self.attrs = {'upscale_factor': up_factor, "data_format": self.format} + + def init_data_format(self): + self.format = "NCHW" def test_check_output(self): self.check_output() @@ -46,5 +79,141 @@ class TestPixelShuffle(OpTest): self.check_grad(['X'], 'Out') +class TestChannelLast(TestPixelShuffleOp): + def init_data_format(self): + self.format = "NHWC" + + +class TestPixelShuffleAPI(unittest.TestCase): + def setUp(self): + self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64") + self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64") + self.out_1_np = pixel_shuffle_np(self.x_1_np, 3) + self.out_2_np = pixel_shuffle_np(self.x_2_np, 3, "NHWC") + + def test_static_graph_functional(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + out_1 = F.pixel_shuffle(x_1, 3) + out_2 = F.pixel_shuffle(x_2, 3, "NHWC") + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_1_np}, + fetch_list=out_1, + use_prune=True) + + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_2_np}, + fetch_list=out_2, + use_prune=True) + + assert np.allclose(res_1, self.out_1_np) + assert np.allclose(res_2, self.out_2_np) + + # same test between layer and functional in this op. 
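pixel_shuffle_np above expresses the channel-to-space rearrangement as reshape, transpose, reshape. A concrete NCHW example with upscale factor r = 2 (illustrative sketch, not part of the patch) shows how each group of r*r channels becomes an r-by-r spatial block:

```python
import numpy as np

n, c, h, w, r = 1, 4, 2, 3, 2
x = np.arange(n * c * h * w, dtype=np.float64).reshape(n, c, h, w)

tmp = x.reshape(n, c // (r * r), r, r, h, w)
out = tmp.transpose(0, 1, 4, 2, 5, 3).reshape(n, c // (r * r), h * r, w * r)

assert out.shape == (1, 1, 4, 6)
# the top-left 2x2 output block collects pixel (0, 0) of channels 0..3
assert out[0, 0, 0, 0] == x[0, 0, 0, 0]
assert out[0, 0, 0, 1] == x[0, 1, 0, 0]
assert out[0, 0, 1, 0] == x[0, 2, 0, 0]
assert out[0, 0, 1, 1] == x[0, 3, 0, 0]
```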
+ def test_static_graph_layer(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + # init instance + ps_1 = paddle.nn.PixelShuffle(3) + ps_2 = paddle.nn.PixelShuffle(3, "NHWC") + out_1 = ps_1(x_1) + out_2 = ps_2(x_2) + out_1_np = pixel_shuffle_np(self.x_1_np, 3) + out_2_np = pixel_shuffle_np(self.x_2_np, 3, "NHWC") + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_1_np}, + fetch_list=out_1, + use_prune=True) + + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_2_np}, + fetch_list=out_2, + use_prune=True) + + assert np.allclose(res_1, out_1_np) + assert np.allclose(res_2, out_2_np) + + def run_dygraph(self, up_factor, data_format): + + n, c, h, w = 2, 9, 4, 4 + + if data_format == "NCHW": + shape = [n, c, h, w] + if data_format == "NHWC": + shape = [n, h, w, c] + + x = np.random.random(shape).astype("float64") + + npresult = pixel_shuffle_np(x, up_factor, data_format) + + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.disable_static(place=place) + + pixel_shuffle = paddle.nn.PixelShuffle( + up_factor, data_format=data_format) + result = pixel_shuffle(paddle.to_tensor(x)) + + self.assertTrue(np.allclose(result.numpy(), npresult)) + + result_functional = F.pixel_shuffle( + paddle.to_tensor(x), 3, data_format) + self.assertTrue(np.allclose(result_functional.numpy(), npresult)) + + def test_dygraph1(self): + self.run_dygraph(3, "NCHW") + + def test_dygraph2(self): + self.run_dygraph(3, "NHWC") + + +class TestPixelShuffleError(unittest.TestCase): + def test_error_functional(self): + def error_upscale_factor(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + pixel_shuffle = F.pixel_shuffle(paddle.to_tensor(x), 3.33) + + self.assertRaises(TypeError, error_upscale_factor) + + def error_data_format(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + pixel_shuffle = F.pixel_shuffle(paddle.to_tensor(x), 3, "WOW") + + self.assertRaises(ValueError, error_data_format) + + def test_error_layer(self): + def error_upscale_factor_layer(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + ps = paddle.nn.PixelShuffle(3.33) + + self.assertRaises(TypeError, error_upscale_factor_layer) + + def error_data_format_layer(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + ps = paddle.nn.PixelShuffle(3, "MEOW") + + self.assertRaises(ValueError, error_data_format_layer) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py new file mode 100644 index 0000000000000000000000000000000000000000..b1a25ad3529e8b0a4126bc458838ecd876e5af30 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -0,0 +1,373 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F +import paddle.fluid as fluid + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def max_pool1D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=False, + adaptive=False, + data_type=np.float64): + N, C, L = x.shape + if global_pool == 1: + ksize = [L] + if adaptive: + L_out = ksize[0] + else: + L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + L - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + + out = np.zeros((N, C, L_out)) + for i in range(L_out): + if adaptive: + r_start = adaptive_start_index(i, L, ksize[0]) + r_end = adaptive_end_index(i, L, ksize[0]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L)) + x_masked = x[:, :, r_start:r_end] + + out[:, :, i] = np.max(x_masked, axis=(2)) + return out + + +def avg_pool1D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=False, + adaptive=False, + data_type=np.float64): + N, C, L = x.shape + if global_pool == 1: + ksize = [L] + if adaptive: + L_out = ksize[0] + else: + L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + L - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + + out = np.zeros((N, C, L_out)) + for i in range(L_out): + if adaptive: + r_start = adaptive_start_index(i, L, ksize[0]) + r_end = adaptive_end_index(i, L, ksize[0]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L)) + x_masked = x[:, :, r_start:r_end] + + field_size = (r_end - r_start) \ + if (exclusive or adaptive) else (ksize[0]) + if data_type == np.int8 or data_type == np.uint8: + out[:, :, i] = (np.rint( + np.sum(x_masked, axis=(2, 3)) / field_size)).astype(data_type) + else: + out[:, :, i] = (np.sum(x_masked, axis=(2)) / + field_size).astype(data_type) + return out + + +class TestPool1d_API(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_avg_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") + result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=0) + + input_np = np.random.random([2, 3, 32]).astype("float32") + result_np = avg_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0], ceil_mode=False) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + 
fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_avg_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=[0]) + + result_np = avg_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0]) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool1d_dg = paddle.nn.layer.AvgPool1d( + kernel_size=2, stride=None, padding=0) + result = avg_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") + result = F.max_pool1d(input, kernel_size=2, stride=2, padding=[0]) + + input_np = np.random.random([2, 3, 32]).astype("float32") + result_np = max_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0]) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_max_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.max_pool1d(input, kernel_size=2, stride=2, padding=0) + + result_np = max_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0]) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool1d_dg = paddle.nn.layer.MaxPool1d( + kernel_size=2, stride=None, padding=0) + result = max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_adaptive_max_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.adaptive_max_pool1d(input, output_size=16) + + result_np = max_pool1D_forward_naive( + input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d( + output_size=16) + result = ada_max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_adaptive_avg_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.adaptive_avg_pool1d(input, output_size=16) + result_np = avg_pool1D_forward_naive( + input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d( + output_size=16) + result = ada_max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_adaptive_max_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") + result = F.adaptive_max_pool1d(input, output_size=16) + + input_np = np.random.random([2, 3, 32]).astype("float32") + result_np = max_pool1D_forward_naive( + input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + 
fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_adaptive_avg_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") + result = F.adaptive_avg_pool1d(input, output_size=16) + + input_np = np.random.random([2, 3, 32]).astype("float32") + result_np = avg_pool1D_forward_naive( + input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_max_dygraph_padding_same(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.max_pool1d( + input, kernel_size=2, stride=2, padding="SAME") + + result_np = max_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0]) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_padding_same(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.avg_pool1d( + input, kernel_size=2, stride=2, padding="SAME") + + result_np = avg_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0]) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def test_pool1d(self): + for place in self.places: + + self.check_max_dygraph_results(place) + self.check_avg_dygraph_results(place) + self.check_max_static_results(place) + self.check_avg_static_results(place) + self.check_adaptive_max_dygraph_results(place) + self.check_adaptive_avg_dygraph_results(place) + self.check_adaptive_max_static_results(place) + self.check_adaptive_avg_static_results(place) + self.check_max_dygraph_padding_same(place) + self.check_avg_dygraph_padding_same(place) + + +class TestPool2dError_API(unittest.TestCase): + def test_error_api(self): + def run1(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = [[2]] + res_pd = F.max_pool1d( + input_pd, kernel_size=2, stride=2, padding=padding) + + self.assertRaises(ValueError, run1) + + def run2(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = [[2]] + res_pd = F.max_pool1d( + input_pd, kernel_size=2, stride=2, padding=padding) + + self.assertRaises(ValueError, run2) + + def run3(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "padding" + res_pd = F.max_pool1d( + input_pd, kernel_size=2, stride=2, padding=padding) + + self.assertRaises(ValueError, run3) + + def run4(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = F.max_pool1d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=True) + + self.assertRaises(ValueError, run4) + + def run5(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = F.max_pool1d( + 
input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=True) + + self.assertRaises(ValueError, run5) + + def run6(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = F.avg_pool1d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=True) + + self.assertRaises(ValueError, run6) + + def run7(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "paddle" + res_pd = F.avg_pool1d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=True) + + self.assertRaises(ValueError, run7) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py new file mode 100644 index 0000000000000000000000000000000000000000..93a2be6de342efc4e8284e7c352137d0a3a1bcb9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -0,0 +1,375 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive +import unittest +from op_test import OpTest +import numpy as np +import paddle.fluid.core as core +from paddle.nn.functional import avg_pool2d, max_pool2d +import paddle.fluid as fluid +import paddle + + +class TestPool2d_API(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_avg_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 32, 32], dtype="float32") + result = avg_pool2d(input, kernel_size=2, stride=2, padding=0) + + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='avg') + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_avg_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d(input, kernel_size=2, stride=2, padding=0) + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='avg') + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=0) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_static_results(self, place): + with 
fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 32, 32], dtype="float32") + result = max_pool2d(input, kernel_size=2, stride=2, padding=0) + + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='max') + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_max_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool2d( + input, kernel_size=2, stride=2, padding=0, return_indices=False) + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='max') + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=0) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_stride_is_none(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result, indices = max_pool2d( + input, + kernel_size=2, + stride=None, + padding="SAME", + return_indices=True) + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='max', + padding_algorithm="SAME") + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=0) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_stride_is_none(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d( + input, kernel_size=2, stride=None, padding="SAME") + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='avg', + padding_algorithm="SAME") + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=0) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_padding(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + padding = [[0, 0], [0, 0], [0, 0], [0, 0]] + result = max_pool2d( + input, + kernel_size=2, + stride=2, + padding=padding, + return_indices=False) + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='max') + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=0) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_divisor(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + padding = [[0, 0], [0, 0], [0, 0], [0, 0]] + result = avg_pool2d( + input, + kernel_size=2, + stride=2, + 
padding=padding, + divisor_override=4) + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='avg') + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=0) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def test_pool2d(self): + for place in self.places: + + self.check_max_dygraph_results(place) + self.check_avg_dygraph_results(place) + self.check_max_static_results(place) + self.check_avg_static_results(place) + self.check_max_dygraph_stride_is_none(place) + self.check_avg_dygraph_stride_is_none(place) + self.check_max_dygraph_padding(place) + self.check_avg_divisor(place) + + +class TestPool2dError_API(unittest.TestCase): + def test_error_api(self): + def run1(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = [[0, 1], [0, 0], [0, 0], [0, 0]] + res_pd = max_pool2d( + input_pd, kernel_size=2, stride=2, padding=padding) + + self.assertRaises(ValueError, run1) + + def run2(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = [[0, 1], [0, 0], [0, 0], [0, 0]] + res_pd = max_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + data_format='NHWC') + + self.assertRaises(ValueError, run2) + + def run3(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "padding" + res_pd = max_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + data_format='NHWC') + + self.assertRaises(ValueError, run3) + + def run3_avg(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "padding" + res_pd = avg_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + data_format='NHWC') + + self.assertRaises(ValueError, run3_avg) + + def run4(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = max_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=True, + data_format='NHWC') + + self.assertRaises(ValueError, run4) + + def run4_avg(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = avg_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=True, + data_format='NHWC') + + self.assertRaises(ValueError, run4_avg) + + def run5(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "padding" + res_pd = avg_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + data_format='NHWC') + + self.assertRaises(ValueError, run5) + + def run6(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = avg_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + 
ceil_mode=True, + data_format='NHWC') + + self.assertRaises(ValueError, run6) + + def run7(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = avg_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=False, + data_format='NNNN') + + self.assertRaises(ValueError, run7) + + def run8(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = "VALID" + res_pd = max_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + ceil_mode=False, + data_format='NNNN') + + self.assertRaises(ValueError, run8) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index a9fdcd55f74cd53824016765fe82a03190f23f89..a12a328b653b26dac31b11839d02e867a012c4bc 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -517,6 +517,19 @@ class TestAvgPoolAdaptive(TestCase1): self.adaptive = True +class TestAvgPoolAdaptiveAsyOutSize(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_shape(self): + self.shape = [8, 3, 6, 6] + + def init_test_case(self): + self.ksize = [2, 3] + self.strides = [1, 1] + self.paddings = [0, 0, 0, 0] + + #-------test pool2d with asymmetric padding----- diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cc078e9aae7aafe55e937b80270dd012fd64ff70 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -0,0 +1,341 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
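+#
+# The checks below exercise paddle.nn.functional.avg_pool3d / max_pool3d and
+# the corresponding paddle.nn.layer classes in both static-graph and dygraph
+# modes, comparing their outputs against the naive NumPy reference
+# pool3D_forward_naive imported from test_pool3d_op.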
+ +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest +import paddle.fluid as fluid +from paddle.nn.functional import avg_pool3d, max_pool3d +from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive + + +class TestPool3d_API(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_avg_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 32, 32, 32], dtype="float32") + result = avg_pool3d(input, kernel_size=2, stride=2, padding=0) + + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='avg') + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_avg_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool3d(input, kernel_size=2, stride=2, padding="SAME") + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='avg', + padding_algorithm="SAME") + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, stride=None, padding="SAME") + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 32, 32, 32], dtype="float32") + result = max_pool3d(input, kernel_size=2, stride=2, padding=0) + + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='max') + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def check_max_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool3d(input, kernel_size=2, stride=2, padding=0) + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='max') + + self.assertTrue(np.allclose(result.numpy(), result_np)) + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=None, padding=0) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_stride_is_none(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result, indices = max_pool3d( + input, + kernel_size=2, + stride=None, + padding="SAME", + return_indices=True) + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], 
+ paddings=[0, 0, 0], + pool_type='max', + padding_algorithm="SAME") + + self.assertTrue(np.allclose(result.numpy(), result_np)) + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=2, padding=0) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_padding(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + padding = [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]] + result = max_pool3d(input, kernel_size=2, stride=2, padding=padding) + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='max') + + self.assertTrue(np.allclose(result.numpy(), result_np)) + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=2, padding=0) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + padding = [0, 0, 0, 0, 0, 0] + result = max_pool3d(input, kernel_size=2, stride=2, padding=padding) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_divisor(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + padding = 0 + result = avg_pool3d( + input, + kernel_size=2, + stride=2, + padding=padding, + divisor_override=8) + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='avg') + + self.assertTrue(np.allclose(result.numpy(), result_np)) + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, stride=2, padding=0) + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + padding = [0, 0, 0, 0, 0, 0] + result = avg_pool3d( + input, + kernel_size=2, + stride=2, + padding=padding, + divisor_override=8) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def test_pool3d(self): + for place in self.places: + + self.check_max_dygraph_results(place) + self.check_avg_dygraph_results(place) + self.check_max_static_results(place) + self.check_avg_static_results(place) + self.check_max_dygraph_stride_is_none(place) + self.check_max_dygraph_padding(place) + self.check_avg_divisor(place) + + +class TestPool3dError_API(unittest.TestCase): + def test_error_api(self): + def run1(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]] + res_pd = avg_pool3d( + input_pd, kernel_size=2, stride=2, padding=padding) + + self.assertRaises(ValueError, run1) + + def run2(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]] + res_pd = avg_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + data_format='NCDHW') + + self.assertRaises(ValueError, run2) + + def run3(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]] + res_pd = avg_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding=padding, + data_format='NDHWC') + + self.assertRaises(ValueError, run3) + + def run4(): + with 
fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = avg_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + data_format='NNNN') + + self.assertRaises(ValueError, run4) + + def run5(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + data_format='NNNN') + + self.assertRaises(ValueError, run5) + + def run6(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = avg_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding="padding", + data_format='NNNN') + + self.assertRaises(ValueError, run6) + + def run7(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding="padding", + data_format='NNNN') + + self.assertRaises(ValueError, run7) + + def run8(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = avg_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding="VALID", + ceil_mode=True, + data_format='NNNN') + + self.assertRaises(ValueError, run8) + + def run9(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding="VALID", + ceil_mode=True, + data_format='NNNN') + + self.assertRaises(ValueError, run9) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index ade7e9f50fd27a3bd4084a628eff445e0d81db0d..3d139e9b90c10e352599d506fb3ca837472348f5 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -453,6 +453,18 @@ class TestAvgPoolAdaptive(TestCase1): self.adaptive = True +class TestAvgPoolAdaptiveAsyOutSize(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_shape(self): + self.shape = [8, 3, 2, 4, 4] + + def init_test_case(self): + self.ksize = [2, 2, 3] + self.strides = [1, 1, 1] + + #-------test pool3d with asymmetric padding------ class TestPool3d_Op_AsyPadding(TestPool3d_Op): def init_test_case(self): diff --git a/python/paddle/fluid/tests/unittests/test_pow.py b/python/paddle/fluid/tests/unittests/test_pow.py new file mode 100755 index 0000000000000000000000000000000000000000..0764cb580e40d115d8703278380a9ced12e24201 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pow.py @@ -0,0 +1,239 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import paddle.tensor as tensor +import paddle.fluid as fluid +from paddle.static import Program, program_guard +import numpy as np +import unittest + +DYNAMIC = 1 +STATIC = 2 + + +def _run_power(mode, x, y): + # dynamic mode + if mode == DYNAMIC: + paddle.disable_static() + # y is scalar + if isinstance(y, (int, float)): + x_ = paddle.to_tensor(x) + y_ = y + res = paddle.pow(x_, y_) + return res.numpy() + # y is tensor + else: + x_ = paddle.to_tensor(x) + y_ = paddle.to_tensor(y) + res = paddle.pow(x_, y_) + return res.numpy() + # static mode + elif mode == STATIC: + paddle.enable_static() + # y is scalar + if isinstance(y, (int, float)): + with program_guard(Program(), Program()): + x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype) + y_ = y + res = paddle.pow(x_, y_) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + outs = exe.run(feed={'x': x}, fetch_list=[res]) + return outs[0] + # y is tensor + else: + with program_guard(Program(), Program()): + x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype) + y_ = paddle.static.data(name="y", shape=y.shape, dtype=y.dtype) + res = paddle.pow(x_, y_) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + outs = exe.run(feed={'x': x, 'y': y}, fetch_list=[res]) + return outs[0] + + +class TestPowerAPI(unittest.TestCase): + """TestPowerAPI.""" + + def test_power(self): + """test_power.""" + np.random.seed(7) + # test 1-d float tensor ** float scalar + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = np.random.rand() * 10 + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d float tensor ** int scalar + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = int(np.random.rand() * 10) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + x = (np.random.rand(*dims) * 10).astype(np.int64) + y = int(np.random.rand() * 10) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d float tensor ** 1-d float tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(*dims) * 10).astype(np.float64) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d float tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(*dims) * 10).astype(np.int64) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # 
test 1-d int tensor ** 1-d float tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.int64) + y = (np.random.rand(*dims) * 10).astype(np.float64) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.int64) + y = (np.random.rand(*dims) * 10).astype(np.int64) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.int32) + y = (np.random.rand(*dims) * 10).astype(np.int32) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.int64) + y = (np.random.rand(*dims) * 10).astype(np.int32) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.int32) + y = (np.random.rand(*dims) * 10).astype(np.int64) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float32) + y = (np.random.rand(*dims) * 10).astype(np.float32) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(*dims) * 10).astype(np.float32) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(*dims) * 10).astype(np.int32) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float32) + y = (np.random.rand(*dims) * 10).astype(np.int64) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + # test broadcast + dims = (np.random.randint(1, 10), np.random.randint(5, 10), + np.random.randint(5, 10)) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(dims[-1]) * 10).astype(np.float64) + res = _run_power(DYNAMIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, 
y))) + res = _run_power(STATIC, x, y) + self.assertTrue(np.allclose(res, np.power(x, y))) + + +class TestPowerError(unittest.TestCase): + """TestPowerError.""" + + def test_errors(self): + """test_errors.""" + np.random.seed(7) + + # test dynamic computation graph: inputs must be broadcastable + dims = (np.random.randint(1, 10), np.random.randint(5, 10), + np.random.randint(5, 10)) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(dims[-1] + 1) * 10).astype(np.float64) + self.assertRaises(fluid.core.EnforceNotMet, _run_power, DYNAMIC, x, y) + self.assertRaises(fluid.core.EnforceNotMet, _run_power, STATIC, x, y) + + # test dynamic computation graph: inputs must be broadcastable + dims = (np.random.randint(1, 10), np.random.randint(5, 10), + np.random.randint(5, 10)) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(dims[-1] + 1) * 10).astype(np.int8) + self.assertRaises(TypeError, paddle.pow, x, y) + + # test 1-d float tensor ** int string + dims = (np.random.randint(200, 300), ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = int(np.random.rand() * 10) + self.assertRaises(TypeError, paddle.pow, x, str(y)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index 0a38bd277bfd1d157c944f5ae28810eb7d5f1dae..16388ff8f5f042326ac5705a5f69919f4f8061c2 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -18,23 +18,134 @@ import unittest import numpy as np import paddle.fluid as fluid import six -import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid import Program, program_guard from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.nn.functional as F + + +def ref_prelu(x, weight): + x_t = x.copy() + weight = weight.reshape(1, -1, 1, 1) + neg_indices = x <= 0 + assert x.shape == neg_indices.shape + x_t[neg_indices] = (x_t * weight)[neg_indices] + return (x_t, ) + + +def ref_prelu_nn(x, num_parameters, init): + weight_np = np.full((num_parameters), init) + return ref_prelu(x, weight_np) -class TestPReluOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program()): +class TestFunctionalPReluAPI(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float32') + self.weight_np_0 = np.random.randn(1).astype('float32') + self.weight_np_1 = np.random.randn(self.x_np.shape[1]).astype('float32') + + def static_check(self, weight_np): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, 'float32') + weight = paddle.data('Alpha', weight_np.shape, 'float32') + out = F.prelu(x, weight) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np, + 'Alpha': weight_np}, + fetch_list=[out]) + out_ref = ref_prelu(self.x_np, weight_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def dygraph_check(self, weight_np): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + weight = paddle.to_tensor(weight_np) + out = F.prelu(x, weight) + out_ref = ref_prelu(self.x_np, weight_np) + self.assertEqual(np.allclose(out_ref, out.numpy()), True) + paddle.enable_static() + + def test_static_api(self): + self.static_check(self.weight_np_0) + 
self.static_check(self.weight_np_1) + + def test_dygraph_api(self): + self.dygraph_check(self.weight_np_0) + self.dygraph_check(self.weight_np_1) + + def test_error(self): + with paddle.static.program_guard(paddle.static.Program()): + weight_fp32 = paddle.data( + name='weight_fp32', shape=[1], dtype='float32') # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.prelu, 0.1, 'all') + self.assertRaises(TypeError, F.prelu, x=1, weight=weight_fp32) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.prelu, x_int32, 'all') - # support the input dtype is float32 - x_fp16 = fluid.layers.data( - name='x_fp16', shape=[12, 10], dtype='float32') - fluid.layers.prelu(x_fp16, 'all') + x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32') + self.assertRaises(TypeError, F.prelu, x=x_int32, weight=weight_fp32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16') + F.prelu(x=x_fp16, weight=weight_fp32) + + +class TestNNPReluAPI(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + self.x_np = np.ones([1, 2, 3, 4]).astype('float32') + + def test_static_api(self): + startup_program = paddle.static.Program() + train_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + x = paddle.data(name='X', shape=self.x_np.shape, dtype='float32') + m = paddle.nn.PReLU() + out = m(x) + exe = paddle.static.Executor(self.place) + exe.run(startup_program) + res = exe.run(train_program, + feed={'X': self.x_np}, + fetch_list=[out]) + out_ref = ref_prelu_nn(self.x_np, 1, 0.25) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + + x = paddle.to_tensor(self.x_np) + m = paddle.nn.PReLU() + out = m(x) + out_ref = ref_prelu_nn(self.x_np, 1, 0.25) + self.assertEqual(np.allclose(out_ref, out.numpy()), True) + + x = paddle.to_tensor(self.x_np) + m = paddle.nn.PReLU(num_parameters=self.x_np.shape[1]) + out = m(x) + out_ref = ref_prelu_nn(self.x_np, self.x_np.shape[1], 0.25) + self.assertEqual(np.allclose(out_ref, out.numpy()), True) + + x = paddle.to_tensor(self.x_np) + m = paddle.nn.PReLU(init=0.5) + out = m(x) + out_ref = ref_prelu_nn(self.x_np, 1, 0.5) + self.assertEqual(np.allclose(out_ref, out.numpy()), True) + + x = paddle.to_tensor(self.x_np) + m = paddle.nn.PReLU(weight_attr=fluid.ParamAttr(name="weight")) + out = m(x) + out_ref = ref_prelu_nn(self.x_np, 1, 0.25) + self.assertEqual(np.allclose(out_ref, out.numpy()), True) + + x = paddle.to_tensor(self.x_np) + m = paddle.nn.PReLU(weight_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.5))) + out = m(x) + out_ref = ref_prelu_nn(self.x_np, 1, 0.5) + self.assertEqual(np.allclose(out_ref, out.numpy()), True) + + paddle.enable_static() class PReluTest(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_prod_op.py b/python/paddle/fluid/tests/unittests/test_prod_op.py new file mode 100644 index 0000000000000000000000000000000000000000..158683907253e2ebc5adab6799c75ffd914df1c7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_prod_op.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import unittest +import numpy as np + + +class TestProdOp(unittest.TestCase): + def setUp(self): + self.input = np.random.random(size=(10, 10, 5)).astype(np.float32) + + def run_imperative(self): + input = paddle.to_tensor(self.input) + dy_result = paddle.prod(input) + expected_result = np.prod(self.input) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + dy_result = paddle.prod(input, axis=1) + expected_result = np.prod(self.input, axis=1) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + dy_result = paddle.prod(input, axis=-1) + expected_result = np.prod(self.input, axis=-1) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + dy_result = paddle.prod(input, axis=[0, 1]) + expected_result = np.prod(self.input, axis=(0, 1)) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + dy_result = paddle.prod(input, axis=1, keepdim=True) + expected_result = np.prod(self.input, axis=1, keepdims=True) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + dy_result = paddle.prod(input, axis=1, dtype='int64') + expected_result = np.prod(self.input, axis=1, dtype=np.int64) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + dy_result = paddle.prod(input, axis=1, keepdim=True, dtype='int64') + expected_result = np.prod( + self.input, axis=1, keepdims=True, dtype=np.int64) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + def run_static(self, use_gpu=False): + input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32') + result0 = paddle.prod(input) + result1 = paddle.prod(input, axis=1) + result2 = paddle.prod(input, axis=-1) + result3 = paddle.prod(input, axis=[0, 1]) + result4 = paddle.prod(input, axis=1, keepdim=True) + result5 = paddle.prod(input, axis=1, dtype='int64') + result6 = paddle.prod(input, axis=1, keepdim=True, dtype='int64') + + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + static_result = exe.run(feed={"input": self.input}, + fetch_list=[ + result0, result1, result2, result3, result4, + result5, result6 + ]) + + expected_result = np.prod(self.input) + self.assertTrue(np.allclose(static_result[0], expected_result)) + expected_result = np.prod(self.input, axis=1) + self.assertTrue(np.allclose(static_result[1], expected_result)) + expected_result = np.prod(self.input, axis=-1) + self.assertTrue(np.allclose(static_result[2], expected_result)) + expected_result = np.prod(self.input, axis=(0, 1)) + self.assertTrue(np.allclose(static_result[3], expected_result)) + expected_result = np.prod(self.input, axis=1, keepdims=True) + self.assertTrue(np.allclose(static_result[4], expected_result)) + expected_result = np.prod(self.input, axis=1, dtype=np.int64) + self.assertTrue(np.allclose(static_result[5], expected_result)) + expected_result = np.prod( + 
self.input, axis=1, keepdims=True, dtype=np.int64) + self.assertTrue(np.allclose(static_result[6], expected_result)) + + def test_cpu(self): + paddle.disable_static(place=paddle.CPUPlace()) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + self.run_static() + + def test_gpu(self): + if not paddle.fluid.core.is_compiled_with_cuda(): + return + + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + self.run_static(use_gpu=True) + + +class TestProdOpError(unittest.TestCase): + def test_error(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.data(name='x', shape=[2, 2, 4], dtype='float32') + bool_x = paddle.data(name='bool_x', shape=[2, 2, 4], dtype='bool') + # The argument x shoule be a Tensor + self.assertRaises(TypeError, paddle.prod, [1]) + + # The data type of x should be float32, float64, int32, int64 + self.assertRaises(TypeError, paddle.prod, bool_x) + + # The argument axis's type shoule be int ,list or tuple + self.assertRaises(TypeError, paddle.prod, x, 1.5) + + # The argument dtype of prod_op should be float32, float64, int32 or int64. + self.assertRaises(TypeError, paddle.prod, x, 'bool') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 6045f2d713627cedfe169b9e066222904244311a..32d8f73552f71dd1066e19b70f1f2e4b8628950a 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -147,10 +147,8 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor): with fluid.program_guard(fluid.Program(), fluid.Program()): with fluid.scope_guard(fluid.core.Scope()): - fluid.default_main_program().random_seed = 1 - fluid.default_startup_program().random_seed = 1 + gen = paddle.manual_seed(1) np.random.seed(1) - img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') loss = simple_fc_net(img, label, use_py_func_op) @@ -189,17 +187,17 @@ class TestPyFuncOpUseExecutor(unittest.TestCase): self.use_parallel_executor = False def test_loss_diff(self): - losses = [] for use_cuda in [True, False]: + losses = [] for use_py_func_op in [True, False]: L = test_main(use_cuda, use_py_func_op, self.use_parallel_executor) if L is not None: losses.append(L) - for idx in six.moves.range(len(losses) - 1): - max_diff = np.max(np.abs(losses[idx] - losses[0])) - self.assertAlmostEqual(max_diff, 0, delta=1e-3) + for idx in six.moves.range(len(losses) - 1): + max_diff = np.max(np.abs(losses[idx] - losses[0])) + self.assertAlmostEqual(max_diff, 0, delta=1e-3) class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 715d66aa3332cef649f867052400b9769bacd979..88b07f5df83f8f967f8ba76e78b37ecfb2c54276 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -125,7 +125,7 @@ class TestRandintAPI(unittest.TestCase): out4 = paddle.randint( low=-100, high=100, shape=[dim_1, 5, dim_2], dtype='int32') # shape is a tensor and dtype is 'float64' - var_shape = paddle.nn.data( + var_shape = paddle.static.data( 
name='var_shape', shape=[2], dtype="int64") out5 = paddle.randint( low=1, high=1000, shape=var_shape, dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py index 8b560f18f9f7bc614e38c1245f48398f808525ed..9d2c03f3bba914d8f6b06b54ce0e19c168edb9e3 100644 --- a/python/paddle/fluid/tests/unittests/test_randn_op.py +++ b/python/paddle/fluid/tests/unittests/test_randn_op.py @@ -34,7 +34,7 @@ class TestRandnOp(unittest.TestCase): dim_2 = paddle.fill_constant([1], "int32", 50) x3 = paddle.randn([dim_1, dim_2, 784]) - var_shape = paddle.nn.data('X', [2], 'int32') + var_shape = paddle.static.data('X', [2], 'int32') x4 = paddle.randn(var_shape) place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py index 31120a73042c98a921da3261116d3872439d6c5a..343508bf619b6a7328016464794221f7284435b2 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -26,27 +26,31 @@ import paddle.fluid.core as core class TestGeneratorSeed(unittest.TestCase): - """ - Test cases for cpu generator seed. - """ + # """ + # Test cases for cpu generator seed. + # """ def test_generator_uniform_random_dygraph(self): """Test Generator seed.""" - gen = generator.Generator() fluid.enable_dygraph() - gen.manual_seed(12312321111) + gen = paddle.manual_seed(12312321111) x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0) + st1 = gen.get_state() x1 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) + gen.set_state(st1) + print(gen.get_state()) x2 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) - gen.manual_seed(12312321111) + + paddle.manual_seed(12312321111) x3 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) + x_np = x.numpy() x1_np = x1.numpy() x2_np = x2.numpy() @@ -57,11 +61,9 @@ class TestGeneratorSeed(unittest.TestCase): self.assertTrue(np.allclose(x_np, x3_np)) def test_generator_uniform_random_static(self): - fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + gen = paddle.manual_seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -92,13 +94,172 @@ class TestGeneratorSeed(unittest.TestCase): self.assertTrue(np.allclose(out1_res2, out2_res2)) self.assertTrue(not np.allclose(out1_res2, out1_res1)) + def test_gen_dropout_dygraph(self): + fluid.enable_dygraph() + + gen = paddle.manual_seed(111111111) + st = gen.get_state() + # x = np.arange(1,101).reshape(2,50).astype("float32") + x = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + y = fluid.layers.dropout(x, 0.5) + gen.manual_seed(111111111) + #gen.set_state(st) + x1 = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + y1 = fluid.layers.dropout(x1, 0.5) + y_np = y.numpy() + y1_np = y1.numpy() + + if not core.is_compiled_with_cuda(): + print(">>>>>>> dropout dygraph >>>>>>>") + self.assertTrue(np.allclose(y_np, y1_np)) + + def test_gen_dropout_static(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. 
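+            # The same dropout program is run twice below; re-seeding with
+            # 123123143 before the second run is expected to reproduce the
+            # dropout mask (asserted on CPU only).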
+ x_1 = fluid.layers.uniform_random(shape=[2, 10]) + y_1 = fluid.layers.dropout(x_1, 0.5) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, feed={}, fetch_list=[y_1]) + #gen.set_state(cur_state) + gen.manual_seed(123123143) + out2 = exe.run(train_program, feed={}, fetch_list=[y_1]) + out1_np = np.array(out1[0]) + out2_np = np.array(out2[0]) + + if not core.is_compiled_with_cuda(): + print(">>>>>>> dropout static >>>>>>>") + self.assertTrue(np.allclose(out1_np, out2_np)) + + def test_generator_gaussian_random_dygraph(self): + """Test Generator seed.""" + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = fluid.layers.gaussian_random([10], dtype="float32") + st1 = gen.get_state() + x1 = fluid.layers.gaussian_random([10], dtype="float32") + gen.set_state(st1) + x2 = fluid.layers.gaussian_random([10], dtype="float32") + gen.manual_seed(12312321111) + x3 = fluid.layers.gaussian_random([10], dtype="float32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if not core.is_compiled_with_cuda(): + print(">>>>>>> gaussian random dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_gaussian_random_static(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. + result_1 = fluid.layers.gaussian_random(shape=[3, 4]) + result_2 = fluid.layers.gaussian_random(shape=[3, 4]) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + #gen.set_state(cur_state) + gen.manual_seed(123123143) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if not core.is_compiled_with_cuda(): + print(">>>>>>> gaussian random static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + def test_generator_randint_dygraph(self): """Test Generator seed.""" gen = generator.Generator() fluid.enable_dygraph() + gen = paddle.manual_seed(12312321111) + x = paddle.randint(low=10, shape=[10], dtype="int32") + st1 = gen.get_state() + x1 = paddle.randint(low=10, shape=[10], dtype="int32") + gen.set_state(st1) + x2 = paddle.randint(low=10, shape=[10], dtype="int32") gen.manual_seed(12312321111) + x3 = paddle.randint(low=10, shape=[10], dtype="int32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if not core.is_compiled_with_cuda(): + print(">>>>>>> randint dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_uniform_random_static(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. 
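+            # uniform_random is fetched twice per run; re-seeding the
+            # generator between the two exe.run calls should reproduce both
+            # outputs (checked on CPU only).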
+ result_1 = fluid.layers.uniform_random(shape=[3, 4]) + result_2 = fluid.layers.uniform_random(shape=[3, 4]) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + #gen.set_state(cur_state) + gen.manual_seed(123123143) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if not core.is_compiled_with_cuda(): + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + def test_generator_randint_dygraph(self): + """Test Generator seed.""" + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) x = paddle.randint(low=1) st1 = gen.get_state() x1 = paddle.randint(low=1) @@ -114,6 +275,220 @@ class TestGeneratorSeed(unittest.TestCase): self.assertTrue(np.allclose(x1_np, x2_np)) self.assertTrue(np.allclose(x_np, x3_np)) + def test_generator_ranint_static(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. + result_1 = paddle.randint(low=10, shape=[3, 4]) + result_2 = paddle.randint(low=10, shape=[3, 4]) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + #gen.set_state(cur_state) + gen.manual_seed(123123143) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if not core.is_compiled_with_cuda(): + print(">>>>>>> randint static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + def test_generator_randperm_dygraph(self): + """Test Generator seed.""" + + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = paddle.randperm(10) + st1 = gen.get_state() + x1 = paddle.randperm(10) + gen.set_state(st1) + x2 = paddle.randperm(10) + gen.manual_seed(12312321111) + x3 = paddle.randperm(10) + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if not core.is_compiled_with_cuda(): + print(">>>>>>> randperm dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_randperm_static(self): + + fluid.disable_dygraph() + + paddle.manual_seed(123123143) + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. 
+ result_1 = paddle.randperm(10) + result_2 = paddle.randperm(10) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + paddle.manual_seed(123123143) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if not core.is_compiled_with_cuda(): + print(">>>>>>> randperm static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + def test_generator_sampling_id_dygraph(self): + """Test Generator seed.""" + gen = paddle.manual_seed(12312321111) + + fluid.enable_dygraph() + + gen.manual_seed(12312321111) + x = fluid.layers.uniform_random( + [10, 10], dtype="float32", min=0.0, max=1.0) + y = fluid.layers.sampling_id(x) + + st1 = gen.get_state() + x1 = fluid.layers.uniform_random( + [10, 10], dtype="float32", min=0.0, max=1.0) + y1 = fluid.layers.sampling_id(x) + + gen.set_state(st1) + x2 = fluid.layers.uniform_random( + [10, 10], dtype="float32", min=0.0, max=1.0) + y2 = fluid.layers.sampling_id(x) + + gen.manual_seed(12312321111) + x3 = fluid.layers.uniform_random( + [10, 10], dtype="float32", min=0.0, max=1.0) + y3 = fluid.layers.sampling_id(x) + + x_np = y.numpy() + x1_np = y1.numpy() + x2_np = y2.numpy() + x3_np = y3.numpy() + + if not core.is_compiled_with_cuda(): + print(">>>>>>> sampling id dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_randperm_static(self): + + fluid.disable_dygraph() + + paddle.manual_seed(123123143) + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. + x = fluid.layers.uniform_random(shape=[10, 10]) + result_1 = fluid.layers.sampling_id(x) + result_2 = fluid.layers.sampling_id(x) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + paddle.manual_seed(123123143) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if not core.is_compiled_with_cuda(): + print(">>>>>>> sampling id static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + def test_gen_TruncatedNormal_initializer(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + cur_state = gen.get_state() + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. 
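+            # In this case the randomness under test is the TruncatedNormal
+            # weight initializer of the fc layers: startup_program is run,
+            # then the generator is re-seeded and startup_program is run
+            # again, and the fc outputs of both runs are expected to match.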
+ x = fluid.layers.uniform_random(shape=[2, 10]) + result_1 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + result_2 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + gen.manual_seed(123123143) + with fluid.program_guard(train_program, startup_program): + exe.run(startup_program) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if not core.is_compiled_with_cuda(): + print(">>>>>>> sampling id static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 0531da2b06ec37fd60389cd2abb85822ebc9d0f9..cf35f9dbcdaaae1357ccdfd6b5cba85ac98d2037 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -67,22 +67,6 @@ class TestSumOp6D(OpTest): self.check_grad(['X'], 'Out') -class TestMeanOp(OpTest): - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} - self.attrs = {'dim': [1]} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") @@ -318,21 +302,6 @@ class TestReduceAll(Test1DReduce): self.outputs = {'Out': self.inputs['X'].sum()} -## reduction in multi dims -class TestReduceMeanOpMultiAxises(OpTest): - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} - self.attrs = {'dim': [1, 2]} - self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") @@ -420,40 +389,6 @@ class TestReduceSumWithNumelOne(OpTest): self.check_grad(['X'], 'Out') -class TestReduceMeanWithDimOne(OpTest): - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((100, 1, 1)).astype("float64")} - self.attrs = {'dim': [1], 'keep_dim': False} - self.outputs = { - 'Out': self.inputs['X'].mean( - axis=tuple(self.attrs['dim']), keepdims=False) - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestReduceMeanWithNumelOne(OpTest): - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((100, 1)).astype("float64")} - self.attrs = {'dim': [1], 'keep_dim': True} - self.outputs = { - 'Out': self.inputs['X'].mean( - axis=tuple(self.attrs['dim']), keepdims=True) - } - - def test_check_output(self): - 
self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - class TestReduceAll(OpTest): def setUp(self): self.op_type = "reduce_sum" @@ -536,18 +471,6 @@ class TestReduceSumOpError(unittest.TestCase): self.assertRaises(TypeError, fluid.layers.reduce_sum, x2) -class TestReduceMeanOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # The input type of reduce_mean_op must be Variable. - x1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) - self.assertRaises(TypeError, fluid.layers.reduce_mean, x1) - # The input dtype of reduce_mean_op must be float32 or float64 or int32 or int64. - x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8") - self.assertRaises(TypeError, fluid.layers.reduce_mean, x2) - - class API_TestSumOpError(unittest.TestCase): def test_errors(self): def test_dtype1(): @@ -580,10 +503,10 @@ class API_TestSumOpError(unittest.TestCase): class API_TestSumOp(unittest.TestCase): - def test_1(self): + def test_static(self): with fluid.program_guard(fluid.Program(), fluid.Program()): data = fluid.data("data", shape=[10, 10], dtype="float32") - result_sum = paddle.sum(input=data, dim=1, dtype="float64") + result_sum = paddle.sum(x=data, axis=1, dtype="float64") place = fluid.CPUPlace() exe = fluid.Executor(place) input_data = np.random.rand(10, 10).astype(np.float32) @@ -593,7 +516,7 @@ class API_TestSumOp(unittest.TestCase): with fluid.program_guard(fluid.Program(), fluid.Program()): data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(input=data, dim=1, dtype="int64") + result_sum = paddle.sum(x=data, axis=1, dtype="int64") place = fluid.CPUPlace() exe = fluid.Executor(place) input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) @@ -603,7 +526,7 @@ class API_TestSumOp(unittest.TestCase): with fluid.program_guard(fluid.Program(), fluid.Program()): data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(input=data, dim=1) + result_sum = paddle.sum(x=data, axis=1) place = fluid.CPUPlace() exe = fluid.Executor(place) input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) @@ -612,39 +535,41 @@ class API_TestSumOp(unittest.TestCase): with fluid.program_guard(fluid.Program(), fluid.Program()): data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(input=data, dim=1) + result_sum = paddle.sum(x=data, axis=1) place = fluid.CPUPlace() exe = fluid.Executor(place) input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) self.assertEqual((res == np.sum(input_data, axis=1)).all(), True) - with fluid.dygraph.guard(): - np_x = np.array([10, 10]).astype('float64') - x = fluid.dygraph.to_variable(np_x) - z = paddle.sum(x, dim=0) - np_z = z.numpy() - z_expected = np.array(np.sum(np_x, axis=0)) - self.assertEqual((np_z == z_expected).all(), True) - - -class API_TestReduceMeanOp(unittest.TestCase): - def test_static(self): with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data("x", shape=[10, 10], dtype="float32") - out = fluid.layers.reduce_mean(input=x, dim=1) + input_data = np.random.randint(10, size=(5, 5, 5)).astype(np.int32) + data = fluid.data("data", shape=[5, 5, 5], dtype="int32") + sum1 = paddle.sum(x=data, axis=[0, 1]) + sum2 = paddle.sum(x=data, axis=()) + place = fluid.CPUPlace() exe = fluid.Executor(place) - x_np = np.random.rand(10, 10).astype(np.float32) - res = exe.run(feed={"x": 
x_np}, fetch_list=[out]) - self.assertEqual(np.allclose(res[0], np.mean(x_np, axis=1)), True) + res1, res2 = exe.run(feed={"data": input_data}, + fetch_list=[sum1, sum2]) + + self.assertEqual((res1 == np.sum(input_data, axis=(0, 1))).all(), True) + self.assertEqual( + (res2 == np.sum(input_data, axis=(0, 1, 2))).all(), True) def test_dygraph(self): + np_x = np.random.random([2, 3, 4]).astype('int32') with fluid.dygraph.guard(): - x_np = np.random.rand(10, 10).astype(np.float32) - x = fluid.dygraph.to_variable(x_np) - out = fluid.layers.reduce_mean(input=x, dim=1) - self.assertEqual(np.allclose(out.numpy(), np.mean(x_np, axis=1)), True) + x = fluid.dygraph.to_variable(np_x) + out0 = paddle.sum(x).numpy() + out1 = paddle.sum(x, axis=0).numpy() + out2 = paddle.sum(x, axis=(0, 1)).numpy() + out3 = paddle.sum(x, axis=(0, 1, 2)).numpy() + + self.assertTrue((out0 == np.sum(np_x, axis=(0, 1, 2))).all()) + self.assertTrue((out1 == np.sum(np_x, axis=0)).all()) + self.assertTrue((out2 == np.sum(np_x, axis=(0, 1))).all()) + self.assertTrue((out3 == np.sum(np_x, axis=(0, 1, 2))).all()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 58b407f8bc1f41301a068f0b85f4c4e9860a45ff..44087c5421a5ee66273ef35b935926d42dcc37ae 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -169,9 +169,10 @@ class TestRegularizer(unittest.TestCase): return param_sum def check_l2decay_regularizer(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() - startup_prog.random_seed = 1 with self.scope_prog_guard( main_prog=main_prog, startup_prog=startup_prog): data = fluid.layers.data( @@ -188,9 +189,11 @@ class TestRegularizer(unittest.TestCase): return param_sum def check_l2decay(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() - startup_prog.random_seed = 1 + with self.scope_prog_guard( main_prog=main_prog, startup_prog=startup_prog): data = fluid.layers.data( @@ -243,7 +246,8 @@ class TestRegularizer(unittest.TestCase): with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( np.random.randn(3, 5).astype('float32')) - fluid.default_main_program().random_seed = 1 + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) linear1 = fluid.dygraph.Linear( 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 3dfd9023f5af30ff289c4dc55a0c275402bc3067..275f9d21f9f8eca653a030bfe5c74071397f33c1 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -18,6 +18,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -227,35 +228,43 @@ class TestReshapeUint8Op(TestReshapeInt8Op): # Test python API class TestReshapeAPI(unittest.TestCase): - # situation 1: have shape( list, no tensor), no actual shape(Tensor) - def test_1(self): + def _set_paddle_api(self): + self.fill_constant = paddle.fill_constant + self.data = paddle.data + self.reshape = paddle.reshape + 
self.to_tensor = paddle.to_tensor + + def _set_fluid_api(self): + self.fill_constant = fluid.layers.fill_constant + self.data = fluid.data + self.reshape = fluid.layers.reshape + + def _test_api(self): input = np.random.random([2, 25]).astype("float32") shape = [2, 5, 5] - positive_five = fluid.layers.fill_constant([1], "int32", 5) - x = fluid.layers.data( - name="x", shape=[2, 25], append_batch_size=False, dtype="float32") + main_prog = Program() + with program_guard(main_prog, Program()): + positive_five = self.fill_constant([1], "int32", 5) + x = self.data(name="x", shape=[2, 25], dtype="float32") - actual_shape = fluid.layers.data( - name="shape", - shape=[1, 3], - append_batch_size=False, - dtype="float32") + actual_shape = self.data(name="shape", shape=[3], dtype="int32") - # situation 1: have shape( list, no tensor), no actual shape(Tensor) - out_1 = fluid.layers.reshape(x, shape) + # situation 1: have shape( list, no tensor), no actual shape(Tensor) + out_1 = self.reshape(x, shape) - # situation 2: have shape(list, no tensor), have actual shape(Tensor) - out_2 = fluid.layers.reshape(x, shape=shape, actual_shape=actual_shape) + # situation 2: have shape(list, no tensor), have actual shape(Tensor) + out_2 = fluid.layers.reshape( + x, shape=shape, actual_shape=actual_shape) - # Situation 3: have shape(list, have tensor), no actual shape(Tensor) - out_3 = fluid.layers.reshape(x, shape=[positive_five, 10]) + # Situation 3: have shape(list, have tensor), no actual shape(Tensor) + out_3 = self.reshape(x, shape=[positive_five, 10]) - # Situation 4: have shape(Tensor), no actual shape(Tensor) - out_4 = fluid.layers.reshape(x, shape=actual_shape) + # Situation 4: have shape(Tensor), no actual shape(Tensor) + out_4 = self.reshape(x, shape=actual_shape) exe = fluid.Executor(place=fluid.CPUPlace()) res_1, res_2, res_3, res_4 = exe.run( - fluid.default_main_program(), + main_prog, feed={"x": input, "shape": np.array([2, 5, 5]).astype("int32")}, fetch_list=[out_1, out_2, out_3, out_4]) @@ -265,76 +274,108 @@ class TestReshapeAPI(unittest.TestCase): assert np.array_equal(res_3, input.reshape([5, 10])) assert np.array_equal(res_4, input.reshape(shape)) + def test_paddle_api(self): + self._set_paddle_api() + self._test_api() + + def test_fluid_api(self): + self._set_fluid_api() + self._test_api() + + def test_imperative(self): + self._set_paddle_api() + input = np.random.random([2, 25]).astype("float32") + shape = [2, 5, 5] + with fluid.dygraph.guard(): + x = self.to_tensor(input) + positive_five = self.fill_constant([1], "int32", 5) + + out_1 = self.reshape(x, shape) + + out_2 = self.reshape(x, shape=[positive_five, 10]) + + shape_tensor = self.to_tensor(np.array([2, 5, 5]).astype("int32")) + out_3 = self.reshape(x, shape=shape_tensor) + + assert np.array_equal(out_1.numpy(), input.reshape(shape)) + assert np.array_equal(out_2.numpy(), input.reshape([5, 10])) + assert np.array_equal(out_3.numpy(), input.reshape(shape)) + # Test Input Error class TestReshapeOpError(unittest.TestCase): - def test_errors(self): + def _set_paddle_api(self): + self.data = paddle.data + self.reshape = paddle.reshape + + def _set_fluid_api(self): + self.data = fluid.data + self.reshape = fluid.layers.reshape + + def _test_errors(self): with program_guard(Program(), Program()): # The x type of reshape_op must be Variable. 
def test_x_type(): x1 = fluid.create_lod_tensor( np.array([[-1]]), [[1]], fluid.CPUPlace()) - fluid.layers.reshape(x1, shape=[1]) + self.reshape(x1, shape=[1]) self.assertRaises(TypeError, test_x_type) # The x dtype of reshape_op must be float16, float32, float64, int32 or int64. def test_x_dtype(): - x2 = fluid.layers.data( - name="x2", - shape=[2, 25], - append_batch_size=False, - dtype="bool") - fluid.layers.reshape(x2, shape=[2, 5, 5]) + x2 = self.data(name="x2", shape=[2, 25], dtype="bool") + self.reshape(x2, shape=[2, 5, 5]) self.assertRaises(TypeError, test_x_dtype) def test_x_dtype_float16(): - x_float16 = fluid.layers.data( - name="x_float16", - shape=[2, 25], - append_batch_size=False, - dtype="float16") - fluid.layers.reshape(x_float16, shape=[2, 5, 5]) + x_float16 = self.data( + name="x_float16", shape=[2, 25], dtype="float16") + self.reshape(x_float16, shape=[2, 5, 5]) test_x_dtype_float16() - x3 = fluid.layers.data( - name="x3", - shape=[2, 25], - append_batch_size=False, - dtype="float32") + x3 = self.data(name="x3", shape=[2, 25], dtype="float32") # The argument shape's type of reshape_op must be list, tuple or Variable. def test_shape_type(): - fluid.layers.reshape(x3, shape=1) + self.reshape(x3, shape=1) self.assertRaises(TypeError, test_shape_type) # The argument actual_shape's type of reshape_op must be Variable or None. def test_actual_shape_type(): - fluid.layers.reshape(x3, shape=[25, 2], actual_shape=1) + self.reshape(x3, shape=[25, 2], actual_shape=1) self.assertRaises(TypeError, test_actual_shape_type) # The argument shape have more than one -1. def test_shape_1(): - fluid.layers.reshape(x3, shape=[-1, -1, 5]) + self.reshape(x3, shape=[-1, -1, 5]) self.assertRaises(AssertionError, test_shape_1) # The argument shape have element 0 whose index exceed the input dimension. def test_shape_2(): - fluid.layers.reshape(x3, [2, 5, 5, 0]) + self.reshape(x3, [2, 5, 5, 0]) self.assertRaises(AssertionError, test_shape_2) # The argument shape have more than one negative value. 
def test_shape_3(): - fluid.layers.reshape(x3, [-1, -2, 5]) + self.reshape(x3, [-1, -2, 5]) self.assertRaises(AssertionError, test_shape_3) + def test_paddle_api_error(self): + self._set_paddle_api() + self._test_errors() + + def test_fluid_api_error(self): + self._set_fluid_api() + self._test_errors() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 360a2de1df6ed8c97dac9ca4473e0b49a240cb5f..9abbee173852baf9db998aad3b71edabdb3e11ed 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -26,7 +26,7 @@ paddle.manual_seed(SEED) class Generator(fluid.dygraph.Layer): def __init__(self): super(Generator, self).__init__() - self.conv1 = paddle.nn.Conv2D(3, 3, 3, 1) + self.conv1 = paddle.nn.Conv2d(3, 3, 3, padding=1) def forward(self, x): x = self.conv1(x) @@ -37,7 +37,7 @@ class Generator(fluid.dygraph.Layer): class Discriminator(fluid.dygraph.Layer): def __init__(self): super(Discriminator, self).__init__() - self.convd = paddle.nn.Conv2D(6, 3, 1) + self.convd = paddle.nn.Conv2d(6, 3, 1) def forward(self, x): x = self.convd(x) @@ -96,8 +96,8 @@ class TestRetainGraph(unittest.TestCase): g = Generator() d = Discriminator() - optim_g = paddle.optimizer.Adam(parameter_list=g.parameters()) - optim_d = paddle.optimizer.Adam(parameter_list=d.parameters()) + optim_g = paddle.optimizer.Adam(parameters=g.parameters()) + optim_d = paddle.optimizer.Adam(parameters=d.parameters()) gan_criterion = paddle.nn.MSELoss() l1_criterion = paddle.nn.L1Loss() diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index eb12bc741767340a3e7e3580a8b95065d4267693..f7b9d4214d36a422a3ec94dc410e58c6c827ef4c 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -20,6 +20,7 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator import paddle.fluid as fluid +import paddle def create_selected_rows_and_tensor(scope, place, height, row_num, @@ -222,5 +223,59 @@ class TestRmspropOp(TestBase): size=size) +class TestRMSPropV2(unittest.TestCase): + def test_rmsprop_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. 
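+        # For illustration only: any 2.0-style optimizer that accepts a
+        # parameters list could be used the same way here, e.g.
+        #     opt = paddle.optimizer.Adam(
+        #         learning_rate=0.01, parameters=linear.parameters())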
+ adam = paddle.optimizer.RMSProp( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_rmsprop(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.RMSProp, None) + self.assertRaises( + ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None) + self.assertRaises( + ValueError, + paddle.optimizer.RMSProp, + learning_rate=0.1, + epsilon=None) + self.assertRaises( + ValueError, + paddle.optimizer.RMSProp, + learning_rate=0.1, + momentum=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 6ca194b2694b6c7537ceb94e11eb1a1a0aeb8d8d..7e2ef36c1a7fda5c31049ec9c752c5226bfb89dc 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -248,7 +248,8 @@ class PolicyGradient(object): func=reward_func, x=[action, length], out=reward) neg_log_prob = layers.cross_entropy(act_prob, action) cost = neg_log_prob * reward - cost = (layers.reduce_sum(cost) / layers.reduce_sum(length) + cost = (layers.reduce_sum(cost) / + layers.cast(layers.reduce_sum(length), "float32") ) if length is not None else layers.reduce_mean(cost) optimizer = fluid.optimizer.Adam(self.lr) optimizer.minimize(cost) diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index 5e9c67c1a7a29b69a977cc94487fc3d26f24eeb8..ce3b060828ac475a10d61bf756423069ab0a70c1 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -16,6 +16,8 @@ from __future__ import print_function import unittest import numpy as np +import paddle +import paddle.fluid as fluid from op_test import OpTest import paddle.fluid.core as core @@ -173,5 +175,55 @@ class TestScatterOp5(OpTest): self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True) +class TestScatterAPI(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[3, 2], dtype="float64") + index = fluid.data(name="index", shape=[4], dtype="int64") + updates = fluid.data(name="updates", shape=[4, 2], dtype="float64") + result = paddle.scatter(input, index, updates, False) + + input_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64) + index_data = np.array([2, 
1, 0, 1]).astype(np.int64) + updates_data = np.array( + [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={ + "input": input_data, + "index": index_data, + "updates": updates_data + }, + fetch_list=[result]) + self.assertEqual((fetches[0] == \ + np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64) + index_data = np.array([2, 1, 0, 1]).astype(np.int64) + updates_data = np.array( + [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64) + + x = fluid.dygraph.to_variable(x_data) + index = fluid.dygraph.to_variable(index_data) + updates = fluid.dygraph.to_variable(updates_data) + + output1 = paddle.scatter(x, index, updates, overwrite=False) + self.assertEqual((output1.numpy() == \ + np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index 6070c84ff236274cc1778d0dce9ab40d884ce7ec..b5a2e84a53ef621f3be81b90d02c10d28fe18162 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -17,9 +17,26 @@ from __future__ import print_function import unittest import numpy as np import six +import paddle.fluid.core as core from op_test import OpTest +import paddle import paddle.fluid as fluid -from paddle.fluid import Program, program_guard +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.fluid import compiler, Program, program_guard + + +def ref_selu(x, + scale=1.0507009873554804934193349852946, + alpha=1.6732632423543772848170429916717): + out = np.copy(x) + out_flat = out.flatten() + for i in range(out_flat.size): + if out_flat[i] < 0: + out_flat[i] = alpha * np.exp(out_flat[i]) - alpha + out_flat[i] = scale * out_flat[i] + out = out_flat.reshape(x.shape) + return out class SeluTest(OpTest): @@ -39,17 +56,10 @@ class SeluTest(OpTest): # zero. x[np.abs(x) < 0.005] = 0.02 - x_flat = x.flatten() - - for i in range(x_flat.size): - if x_flat[i] < 0: - x_flat[i] = alpha * np.exp(x_flat[i]) - alpha - x_flat[i] = scale * x_flat[i] - - out_np = x_flat.reshape(self.x_shape) + out = ref_selu(x, scale, alpha) self.inputs = {'X': x} - self.outputs = {'Out': out_np} + self.outputs = {'Out': out} self.attrs = { 'alpha': alpha, @@ -69,17 +79,65 @@ class SeluTest(OpTest): self.check_grad(['X'], 'Out') -class TestSeluOpError(unittest.TestCase): +class TestSeluAPI(unittest.TestCase): + # test paddle.nn.SELU, paddle.nn.functional.selu + def setUp(self): + self.scale = 1.5 + self.alpha = 2.0 + self.x_np = np.random.normal(size=[3, 5, 5, 10]).astype(np.float64) + # Since zero point in selu is not differentiable, avoid randomize + # zero. 
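+        # For reference, ref_selu above evaluates
+        #     selu(x) = scale * x                      for x >= 0
+        #     selu(x) = scale * alpha * (exp(x) - 1)   for x <  0
+        # hence the non-differentiable point at zero that the next line
+        # steers the random inputs away from.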
+ self.x_np[np.abs(self.x_np) < 0.005] = 0.02 + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.selu(x, self.scale, self.alpha) + selu = paddle.nn.SELU(self.scale, self.alpha) + out2 = selu(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_selu(self.x_np, self.scale, self.alpha) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.selu(x, self.scale, self.alpha) + selu = paddle.nn.SELU(self.scale, self.alpha) + out2 = selu(x) + out_ref = ref_selu(self.x_np, self.scale, self.alpha) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.selu(x, self.scale, self.alpha) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_selu(self.x_np, self.scale, self.alpha) + self.assertEqual(np.allclose(out_ref, res[0]), True) + def test_errors(self): - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.selu, 1) + self.assertRaises(TypeError, F.selu, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.selu, x_int32) - # support the input dtype is float32 - x_fp32 = fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') - fluid.layers.selu(x_fp32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.selu, x_int32) + # The scale must be greater than 1.0 + x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32') + self.assertRaises(ValueError, F.selu, x_fp32, -1.0) + # The alpha must be no less than 0 + self.assertRaises(ValueError, F.selu, x_fp32, 1.6, -1.0) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.selu(x_fp16) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py index b84e3b5377f2796803707dfd68cd5450c512fce7..da5080eabddc93f0c3d08f16e0a7c20b52af47e0 100644 --- a/python/paddle/fluid/tests/unittests/test_sign_op.py +++ b/python/paddle/fluid/tests/unittests/test_sign_op.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -54,5 +55,32 @@ class TestSignOpError(unittest.TestCase): fluid.layers.sign(input4) +class TestSignAPI(unittest.TestCase): + def test_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.array([-1., 0., -0., 1.2, 1.5], dtype='float64') + x = paddle.to_tensor(np_x) + z = paddle.sign(x) + np_z = z.numpy() + z_expected = np.sign(np_x) + self.assertEqual((np_z == z_expected).all(), True) + + def test_static(self): + with program_guard(Program(), Program()): + # The input type of sign_op must be Variable or 
numpy.ndarray. + input1 = 12 + self.assertRaises(TypeError, paddle.tensor.math.sign, input1) + # The input dtype of sign_op must be float16, float32, float64. + input2 = fluid.layers.data( + name='input2', shape=[12, 10], dtype="int32") + input3 = fluid.layers.data( + name='input3', shape=[12, 10], dtype="int64") + self.assertRaises(TypeError, paddle.tensor.math.sign, input2) + self.assertRaises(TypeError, paddle.tensor.math.sign, input3) + input4 = fluid.layers.data( + name='input4', shape=[4], dtype="float16") + paddle.sign(input4) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9a97f57aaae5f290b20e34242b1b43e5e352223d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py @@ -0,0 +1,181 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import numpy as np +import unittest + + +def smooth_l1_loss_forward(val, delta): + abs_val = abs(val) + if abs_val <= delta: + return 0.5 * val * val + else: + return delta * (abs_val - 0.5 * delta) + + +def smooth_l1_loss_np(input, label, reduction='mean', delta=1.0): + diff = input - label + out = np.vectorize(smooth_l1_loss_forward)(diff, delta) + if reduction == 'sum': + return np.sum(out) + elif reduction == 'mean': + return np.mean(out) + elif reduction == 'none': + return out + + +class SmoothL1Loss(unittest.TestCase): + def setUp(self): + np.random.seed(123) + + def test_smooth_l1_loss_mean(self): + input_np = np.random.random([100, 200]).astype(np.float32) + label_np = np.random.random([100, 200]).astype(np.float32) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float32') + label = fluid.data(name='label', shape=[100, 200], dtype='float32') + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss() + ret = smooth_l1_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss() + dy_ret = smooth_l1_loss( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = smooth_l1_loss_np(input_np, label_np, reduction='mean') + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + def test_smooth_l1_loss_sum(self): + input_np = np.random.random([100, 200]).astype(np.float32) + 
label_np = np.random.random([100, 200]).astype(np.float32) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float32') + label = fluid.data(name='label', shape=[100, 200], dtype='float32') + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum') + ret = smooth_l1_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum') + dy_ret = smooth_l1_loss( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = smooth_l1_loss_np(input_np, label_np, reduction='sum') + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + def test_smooth_l1_loss_none(self): + input_np = np.random.random([100, 200]).astype(np.float32) + label_np = np.random.random([100, 200]).astype(np.float32) + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float32') + label = fluid.data(name='label', shape=[100, 200], dtype='float32') + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='none') + ret = smooth_l1_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='none') + dy_ret = smooth_l1_loss( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = smooth_l1_loss_np(input_np, label_np, reduction='none') + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + def test_smooth_l1_loss_delta(self): + input_np = np.random.random([100, 200]).astype(np.float32) + label_np = np.random.random([100, 200]).astype(np.float32) + delta = np.random.rand() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data(name='input', shape=[100, 200], dtype='float32') + label = fluid.data(name='label', shape=[100, 200], dtype='float32') + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(delta=delta) + ret = smooth_l1_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(delta=delta) + dy_ret = smooth_l1_loss( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = smooth_l1_loss_np(input_np, 
label_np, delta=delta) + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 25e95216968b51e5093497b57768d7eb31952370..04d5cc941a4636da0352fe9221cdad8bdfcd2bd9 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -35,6 +35,15 @@ def stable_softmax(x): return exps / np.sum(exps) +def ref_softmax(x, axis=None, dtype=None): + x_t = x.copy() + if dtype is not None: + x_t = x_t.astype(dtype) + if axis is None: + axis = -1 + return np.apply_along_axis(stable_softmax, axis, x_t) + + class TestSoftmaxOp(OpTest): def get_x_shape(self): return [10, 10] @@ -93,20 +102,6 @@ class TestSoftmaxOp(OpTest): check_dygraph=(self.use_mkldnn == False)) -class TestSoftmaxOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # The input type of softmax_op must be Variable. - x1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) - self.assertRaises(TypeError, fluid.layers.softmax, x1) - # The input dtype of softmax_op must be float16, float32 or float64. - x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32") - self.assertRaises(TypeError, fluid.layers.softmax, x2) - x3 = fluid.layers.data(name='x3', shape=[4], dtype="float16") - fluid.layers.softmax(x3) - - class TestSoftmaxOp2(TestSoftmaxOp): def get_x_shape(self): return [2, 3, 4, 5] @@ -224,41 +219,59 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp): return [2, 3, 4, 5] -class TestNnFunctionalSoftmaxApi(unittest.TestCase): +class TestSoftmaxAPI(unittest.TestCase): def setUp(self): self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() self.x_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype('float32') self.out_ref = np.apply_along_axis(stable_softmax, -1, self.x_np) - def test_api_static(self): - with program_guard(Program()): + def test_static_check(self): + with paddle.static.program_guard(paddle.static.Program()): x = paddle.data('X', self.x_np.shape, 'float32') - out = F.softmax(x) + out1 = F.softmax(x) + m = paddle.nn.Softmax() + out2 = m(x) exe = paddle.static.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - self.assertEqual(np.allclose(self.out_ref, res[0]), True) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_softmax(self.x_np, axis=-1, dtype=None) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) - def test_api_imperative(self): + def test_dygraph_check(self): paddle.disable_static(self.place) - x = paddle.to_variable(self.x_np) - out = F.softmax(x) - self.assertEqual(np.allclose(self.out_ref, out.numpy()), True) - - out = F.softmax(x, axis=0) - out_ref = np.apply_along_axis(stable_softmax, 0, self.x_np) + x = paddle.to_tensor(self.x_np) + out1 = F.softmax(x) + m = paddle.nn.Softmax() + out2 = m(x) + out_ref = ref_softmax(self.x_np, axis=-1, dtype=None) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + + out1 = F.softmax(x, axis=0) + m = paddle.nn.Softmax(axis=0) + out2 = m(x) + out_ref = ref_softmax(self.x_np, axis=0, dtype=None) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + + out = F.softmax(x, dtype=np.float64) + 
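+        # The dtype argument promotes the input before the softmax is
+        # computed; ref_softmax applies the same cast, so the comparison
+        # below is like-for-like in float64.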
out_ref = ref_softmax(self.x_np, axis=-1, dtype=np.float64) self.assertEqual(np.allclose(out_ref, out.numpy()), True) paddle.enable_static() def test_error(self): - with program_guard(Program(), Program()): - # The x should be variable and its dtype should be float32, float64. - self.assertRaises(TypeError, F.softmax, [1]) - - x = paddle.data(name='x', shape=[2, 3], dtype='int32') - self.assertRaises(TypeError, F.softmax, x) + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.softmax, 1) + # The input dtype must be float16, float32, float64. + x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32') + self.assertRaises(TypeError, F.softmax, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16') + F.softmax(x_fp16) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py new file mode 100644 index 0000000000000000000000000000000000000000..ca92bc75245cebbfdfbbed80e99957d2b4f57b2a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -0,0 +1,87 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import numpy as np +import unittest + +import paddle +import paddle.distributed as dist +from paddle.distributed.spawn import _get_subprocess_env_list + +from paddle.fluid import core +from paddle.fluid.dygraph import parallel_helper + +# NOTE(chenweihang): Coverage CI is currently not able to count python3 +# unittest, so the unittests here covers some cases that will only be +# executed in the python3 sub-process. 
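+#
+# For context, the helpers under test back the usual multi-process launch
+# flow, roughly (illustrative sketch only, not executed by this file):
+#
+#     def train():
+#         dist.init_parallel_env()   # set up the parallel context from env vars
+#         ...                        # build the model / DataParallel / train loop
+#
+#     if __name__ == '__main__':
+#         dist.spawn(train, nprocs=2)
+#
+# The cases below only exercise argument checking and environment handling.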
+ + +class TestInitParallelEnv(unittest.TestCase): + def test_beckend_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(backend=1) + + def test_backend_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(backend="mpi") + + def test_check_env_failed(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + with self.assertRaises(ValueError): + dist.init_parallel_env() + + def test_init_parallel_env_break(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + os.environ['PADDLE_TRAINER_ENDPOINTS'] = '127.0.0.1:6170' + # coverage success branch + dist.init_parallel_env() + self.assertFalse(parallel_helper._is_parallel_ctx_initialized()) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSpawnAssistMethod(unittest.TestCase): + def test_only_cluster_node_ips_error(self): + with self.assertRaises(ValueError): + options = dict() + options['cluster_node_ips'] = "127.0.0.1,127.0.0.2" + _get_subprocess_env_list(nprocs=1, options=options) + + def test_nprocs_greater_than_device_num_error(self): + with self.assertRaises(RuntimeError): + _get_subprocess_env_list(nprocs=100, options=dict()) + + def test_selected_gpus_error(self): + with self.assertRaises(ValueError): + options = dict() + options['selected_gpus'] = "100,101" + _get_subprocess_env_list(nprocs=2, options=options) + + def test_get_correct_env(self): + env_dict = _get_subprocess_env_list(nprocs=1, options=dict())[0] + self.assertEqual(env_dict['PADDLE_TRAINER_ID'], '0') + self.assertEqual(env_dict['PADDLE_TRAINERS_NUM'], '1') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_std_layer.py b/python/paddle/fluid/tests/unittests/test_std_layer.py index d1e0056304204bf0dbe47982bbf4b9574acf8eac..e455151481443c1fb918efd9e44444536adc6b7f 100644 --- a/python/paddle/fluid/tests/unittests/test_std_layer.py +++ b/python/paddle/fluid/tests/unittests/test_std_layer.py @@ -15,65 +15,104 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -class TestStdLayer(unittest.TestCase): +def ref_std(x, axis=None, unbiased=True, keepdim=False): + ddof = 1 if unbiased else 0 + if isinstance(axis, int): + axis = (axis, ) + if axis is not None: + axis = tuple(axis) + return np.std(x, axis=axis, ddof=ddof, keepdims=keepdim) + + +class TestStdAPI(unittest.TestCase): def setUp(self): - self._dtype = "float64" - self._input = np.random.random([2, 3, 4, 5]).astype(self._dtype) - - def static(self, axis=None, keepdim=False, unbiased=True): - prog = fluid.Program() - with fluid.program_guard(prog): - data = fluid.data( - name="data", dtype=self._dtype, shape=[None, 3, 4, 5]) - out = prog.current_block().create_var( - dtype=self._dtype, shape=[2, 3, 4, 5]) - paddle.std(input=data, - axis=axis, - keepdim=keepdim, - unbiased=unbiased, - out=out) - - exe = fluid.Executor(self._place) - return exe.run(feed={"data": self._input}, - program=prog, - fetch_list=[out])[0] - - def dynamic(self, axis=None, keepdim=False, unbiased=True): - with fluid.dygraph.guard(self._place): - data = fluid.dygraph.to_variable(self._input) - out = paddle.std(input=data, - axis=axis, - keepdim=keepdim, - unbiased=unbiased) - return out.numpy() - - 
def numpy(self, axis=None, keepdim=False, unbiased=True): - ddof = 1 if unbiased else 0 - axis = tuple(axis) if isinstance(axis, list) else axis - return np.std(self._input, axis=axis, keepdims=keepdim, ddof=ddof) - - def test_equal(self): - places = [] - if fluid.core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for place in places: - self._place = place - self.assertTrue(np.allclose(self.numpy(), self.static())) - self.assertTrue( - np.allclose( - self.numpy(axis=[0, 2]), self.dynamic(axis=[0, 2]))) - self.assertTrue( - np.allclose( - self.numpy( - axis=[1, 3], keepdim=True), - self.dynamic( - axis=[1, 3], keepdim=True))) - self.assertTrue( - np.allclose( - self.numpy(unbiased=False), self.dynamic(unbiased=False))) + self.dtype = 'float64' + self.shape = [1, 3, 4, 10] + self.axis = [1, 3] + self.keepdim = False + self.unbiased = True + self.set_attrs() + self.x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + self.place=paddle.CUDAPlace(0) \ + if paddle.fluid.core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def set_attrs(self): + pass + + def static(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.shape, self.dtype) + out = paddle.std(x, self.axis, self.unbiased, self.keepdim) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, fetch_list=[out]) + return res[0] + + def dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x) + out = paddle.std(x, self.axis, self.unbiased, self.keepdim) + paddle.enable_static() + return out.numpy() + + def test_api(self): + out_ref = ref_std(self.x, self.axis, self.unbiased, self.keepdim) + out_dygraph = self.dygraph() + out_static = self.static() + for out in [out_dygraph, out_static]: + self.assertTrue(np.allclose(out_ref, out)) + self.assertTrue(np.equal(out_ref.shape, out.shape).all()) + + +class TestStdAPI_dtype(TestStdAPI): + def set_attrs(self): + self.dtype = 'float32' + + +class TestStdAPI_axis_int(TestStdAPI): + def set_attrs(self): + self.axis = 2 + + +class TestStdAPI_axis_list(TestStdAPI): + def set_attrs(self): + self.axis = [1, 2] + + +class TestStdAPI_axis_tuple(TestStdAPI): + def set_attrs(self): + self.axis = (1, 3) + + +class TestStdAPI_keepdim(TestStdAPI): + def set_attrs(self): + self.keepdim = False + + +class TestStdAPI_unbiased(TestStdAPI): + def set_attrs(self): + self.unbiased = False + + +class TestStdAPI_alias(unittest.TestCase): + def test_alias(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([10, 12], 'float32')) + out1 = paddle.std(x).numpy() + out2 = paddle.tensor.std(x).numpy() + out3 = paddle.tensor.stat.std(x).numpy() + self.assertTrue(np.allclose(out1, out2)) + self.assertTrue(np.allclose(out1, out3)) + paddle.enable_static() + + +class TestStdError(unittest.TestCase): + def test_error(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [2, 3, 4], 'int32') + self.assertRaises(TypeError, paddle.std, x) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 806b6b90e7e2d39a3d4e5f3792cd849022097777..b0701a9b187f6c7cf63f43d69f482ea13e6d3fe3 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -22,6 +22,7 @@ import unittest import numpy as np import os import six +import paddle import paddle.fluid.core as core import paddle.fluid as 
fluid from paddle.fluid import compiler @@ -209,7 +210,7 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase): return with program_guard(Program(), Program()): - my_sync_batch_norm = fluid.dygraph.SyncBatchNorm(10) + my_sync_batch_norm = paddle.nn.SyncBatchNorm(10) x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0)) self.assertRaises(TypeError, my_sync_batch_norm, x1) diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py index 73e6ff5dbd69c3ce8d632ad8982eaf3826c2cc60..5aaf31993448ab0ff0c69f648cfa84c62d3e198b 100644 --- a/python/paddle/fluid/tests/unittests/test_tile_op.py +++ b/python/paddle/fluid/tests/unittests/test_tile_op.py @@ -215,6 +215,16 @@ class TestTileError(unittest.TestCase): self.assertRaises(ValueError, paddle.tile, x3, repeat_times) +class TestTileAPIStatic(unittest.TestCase): + def test_api(self): + with program_guard(Program(), Program()): + repeat_times = [2, 2] + x1 = fluid.layers.data(name='x1', shape=[4], dtype="int32") + out = paddle.tile(x1, repeat_times) + positive_2 = fluid.layers.fill_constant([1], dtype="int32", value=2) + out2 = paddle.tile(x1, repeat_times=[positive_2, 2]) + + # Test python API class TestTileAPI(unittest.TestCase): def test_api(self): diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..54e7765c0fb76844a6123fceea6c1ef79dc0c2bf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py @@ -0,0 +1,244 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
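+#
+# The tests below cover the new top_k_v2 kernel through paddle.topk, which
+# returns a (values, indices) pair, e.g. (illustrative sketch only):
+#
+#     values, indices = paddle.topk(x, k=2, axis=1, largest=True, sorted=True)
+#
+# numpy_topk defined below is the NumPy reference used to check both outputs.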
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid.core as core + + +def numpy_topk(x, k=1, axis=-1, largest=True): + if axis < 0: + axis = len(x.shape) + axis + if largest: + indices = np.argsort(-x, axis=axis) + else: + indices = np.argsort(x, axis=axis) + if largest: + value = -np.sort(-x, axis=axis) + else: + value = np.sort(x, axis=axis) + indices = indices.take(indices=range(0, k), axis=axis) + value = value.take(indices=range(0, k), axis=axis) + return value, indices + + +class TestTopkOp(OpTest): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float64 + self.input_data = np.random.rand(10, 20) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def test_check_grad(self): + paddle.enable_static() + self.check_grad(set(['X']), 'Out') + + +class TestTopOp1(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 0 + self.largest = True + + +class TestTopOp2(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 0 + self.largest = False + + +class TestTopOp3(TestTopkOp): + def init_args(self): + self.k = 4 + self.axis = 0 + self.largest = False + + +class TestTopOp4(TestTopkOp): + def init_args(self): + self.k = 4 + self.axis = 0 + self.largest = False + + +class TestTopkOp5(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float64 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp6(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float64 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopKAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.input_data = np.random.rand(6, 7, 8) + self.large_input_data = np.random.rand(2, 1030) + + def run_dygraph(self, place): + paddle.disable_static(place) + input_tensor = paddle.to_tensor(self.input_data) + large_input_tensor = paddle.to_tensor(self.large_input_data) + # test case for basic test case 1 + paddle_result = paddle.topk(input_tensor, k=2) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 2 with axis + paddle_result = paddle.topk(input_tensor, k=2, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + 
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 3 with tensor K + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 4 with tensor largest + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 5 with axis -1 + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 6 for the partial sort + paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1) + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 7 for the unsorted + paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + sort_paddle = numpy_topk( + np.array(paddle_result[0].numpy()), axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_tensor = paddle.static.data( + name="x", shape=[6, 7, 8], dtype="float64") + large_input_tensor = paddle.static.data( + name="large_x", shape=[2, 1030], dtype="float64") + k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32") + result1 = paddle.topk(input_tensor, k=2) + result2 = paddle.topk(input_tensor, k=2, axis=-1) + result3 = paddle.topk(input_tensor, k=k_tensor, axis=1) + result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False) + result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + result6 = paddle.topk(large_input_tensor, k=1, axis=-1) + result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + exe = paddle.static.Executor(place) + input_data = np.random.rand(10, 20).astype("float64") + large_input_data = np.random.rand(2, 100).astype("float64") + paddle_result = exe.run( + feed={ + "x": self.input_data, + "large_x": self.large_input_data, + "k": np.array([2]).astype("int32") + }, + fetch_list=[ + result1[0], result1[1], result2[0], result2[1], result3[0], + result3[1], result4[0], result4[1], result5[0], result5[1], + result6[0], result6[1], result7[0], result7[1] + ]) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1], numpy_result[1])) + numpy_result = numpy_topk(self.input_data, k=2, axis=-1) + self.assertTrue(np.allclose(paddle_result[2], numpy_result[0])) + 
self.assertTrue(np.allclose(paddle_result[3], numpy_result[1])) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[4], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[5], numpy_result[1])) + numpy_result = numpy_topk( + self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[6], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[7], numpy_result[1])) + numpy_result = numpy_topk( + self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[8], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[9], numpy_result[1])) + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[10], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[11], numpy_result[1])) + sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def test_cases(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.run_dygraph(place) + self.run_static(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5fea9f69a18c83be0f6af05784735ea53d0993d2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -0,0 +1,479 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
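As a quick reference for the layers covered by this new transformer test, a minimal sketch of the dygraph call pattern it exercises: build a TransformerEncoderLayer and run one forward pass. The dimensions are illustrative; the src/src_mask shapes follow the convention used throughout the tests below ([batch, seq_len, d_model] and [batch, n_head, seq_len, seq_len]):

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.nn.layer.transformer import TransformerEncoderLayer

with fluid.dygraph.guard(fluid.CPUPlace()):
    d_model, n_head, dim_feedforward = 64, 4, 256
    # random input and an all-zero (i.e. no-op) attention mask
    src = np.random.rand(2, 10, d_model).astype("float32")
    src_mask = np.zeros((2, n_head, 10, 10)).astype("float32")
    encoder_layer = TransformerEncoderLayer(d_model, n_head, dim_feedforward, 0.0)
    out = encoder_layer(paddle.to_variable(src), paddle.to_variable(src_mask))
    # out has the same shape as src: [2, 10, d_model]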
+ +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.nn.layer.transformer import MultiHeadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer + +import unittest + + +def generate_basic_params(mode="attn", self_attention=True): + batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)] + d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)] + attn_dropout = 0.0 + embed_dim = d_head * num_heads + if mode == "attn": + if self_attention: + kdim, vdim = embed_dim, embed_dim + key_length, value_length = query_length, query_length + else: + kdim, vdim = [np.random.randint(5, 20) for _ in range(2)] + key_length = np.random.randint(2, 10) + value_length = key_length + return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout + + else: + dropout, act_dropout = 0.0, 0.0 + dim_feedforward = np.random.randint(128, 1024) + sequence_length = np.random.randint(2, 10) + if mode == "encoder_layer": + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length + elif mode == "decoder_layer": + target_length = np.random.randint(2, 10) + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length + + +def generate_query_key_value_cache(self_attention, + batch_size, + num_heads, + query_length, + embed_dim, + key_length=None, + value_length=None, + kdim=None, + vdim=None, + cache=None): + query = np.random.rand(batch_size, query_length, + embed_dim).astype("float32") + attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) + attn_mask[0][0][0][0] = -1e9 + + head_dim = embed_dim // num_heads + if self_attention: + key, value = query, query + else: + key = np.random.rand(batch_size, key_length, kdim).astype("float32") + value = np.random.rand(batch_size, value_length, vdim).astype("float32") + cache_dict = {} + if cache: + if not self_attention: + cache_dict["static_k"] = np.random.rand( + batch_size, num_heads, key_length, head_dim).astype("float32") + cache_dict["static_v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length, + head_dim).astype("float32") + cache_dict["v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict = None + return query, key, value, attn_mask, cache_dict + + +def fc(x, weight): + return np.matmul(x, weight) + + +def softmax(x): + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + for k in range(x.shape[2]): + x_curr = x[i, j, k, :] + e_x = np.exp(x_curr - np.amax(x_curr)) + output[i, j, k, :] = e_x / np.sum(e_x) + return output + + +def batch_matmul(x, y): + assert x.shape[0] == y.shape[0] + assert x.shape[1] == y.shape[1] + retval = np.zeros( + (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) + return retval + + +def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): + k = k.transpose([0, 1, 3, 2]) + qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) + if attn_mask is not None: + qkt += attn_mask + weight = softmax(qkt) + attn_heads = batch_matmul(weight, v) + attn_heads = 
attn_heads.transpose((0, 2, 1, 3)) + attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1], + attn_heads.shape[2] * attn_heads.shape[3])) + return attn_heads + + +def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn): + with fluid.dygraph.guard(): + head_dim = embed_dim // num_heads + k_weight = multi_head_attn.k_proj.weight.numpy() + v_weight = multi_head_attn.v_proj.weight.numpy() + k = fc(key, k_weight) + v = fc(value, v_weight) + k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim)) + k = k.transpose((0, 2, 1, 3)) + v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim)) + v = v.transpose((0, 2, 1, 3)) + return k, v + + +def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, + multi_head_attn, cache_dict): + q_weight = multi_head_attn.q_proj.weight.numpy() + q = fc(query, q_weight) + q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads)) + q = q.transpose((0, 2, 1, 3)) + + if not self_attention and cache_dict: + k, v = cache_dict["static_k"], cache_dict["static_v"] + else: + k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn) + if cache_dict is not None: + k = np.concatenate((cache_dict["k"], k), axis=2) + v = np.concatenate((cache_dict["v"], v), axis=2) + return (q, k, v, cache_dict) + + +def add(x, y=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + x = x.numpy() if not isinstance(x, np.ndarray) else x + if y is not None: + x += y + return x + return x + + +def relu(x): + compare = x > 0 + return x * compare + + +def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + # scale: + weight = norm.weight.numpy() + # shift: + bias = norm.bias.numpy() + + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = weight * x1 + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + + +def ffn(src, encoder_layer, ffn_fc1_act="relu"): + assert ffn_fc1_act == "relu", "only relu is supported" + fluid.enable_dygraph() + with fluid.dygraph.guard(): + src = src.numpy() if not isinstance(src, np.ndarray) else src + w1 = encoder_layer.linear1.weight.numpy() + w2 = encoder_layer.linear2.weight.numpy() + # fc1 + x1 = fc(src, w1) + x1 = relu(x1) + # fc2 + x2 = fc(x1, w2) + return x2 + + +class TestTransformer(unittest.TestCase): + def test_multi_head_attention(self): + def multihead_attention_test_helper(self_attention, cache): + paddle.manual_seed(2020) + paddle.framework.random._manual_program_seed(2020) + # self_attention|cross_attention, cache|No cache + with fluid.dygraph.guard(fluid.CPUPlace()): + + # generate params for multi_head_attention + batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( + "attn", self_attention) + query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( + self_attention, batch_size, num_heads, query_length, + embed_dim, key_length, value_length, kdim, vdim, cache) + if cache and self_attention: + attn_mask = np.concatenate((attn_mask, attn_mask), axis=3) + need_weight, param_attr, bias_attr = False, None, None + # call paddle's function + 
multi_head_attn = MultiHeadAttention( + embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight, + param_attr, bias_attr) + # construct cache object + cache_obj = None + if cache_dict: + if 'k' and 'v' in cache_dict: + cache_obj = multi_head_attn.Cache( + paddle.to_variable(cache_dict['k']), + paddle.to_variable(cache_dict['v'])) + elif 'static_k' and 'static_v' in cache_dict: + cache_obj = multi_head_attn.StaticCache( + paddle.to_variable(cache_dict['static_k']), + paddle.to_variable(cache_dict['static_v'])) + if attn_mask is not None: + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), + paddle.to_variable(attn_mask), cache_obj) + else: + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), attn_mask, cache_obj) + attn_output = attn_output[0] if cache_dict else attn_output + + # implementation by numpy + # compute q, k, v + q, k, v, _ = prepare_qkv(query, key, value, num_heads, + embed_dim, self_attention, + multi_head_attn, cache_dict) + # scale dot product attention + attn_heads = scaled_dot_product_attention( + q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn) + out_proj_weight = multi_head_attn.out_proj.weight.numpy() + reference = fc(attn_heads, out_proj_weight) + + np.testing.assert_allclose( + attn_output.numpy(), reference, atol=1e-6) + + multihead_attention_test_helper(True, True) + multihead_attention_test_helper(True, False) + multihead_attention_test_helper(False, True) + multihead_attention_test_helper(False, False) + + def test_transformer_encoder_layer(self): + + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + paddle.framework.random._manual_program_seed(2020) + + ffn_fc1_act = "relu" + # 1.generate basic params + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer") + # 2.generate input for encoder + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + residual = src + src_mask = np.zeros((batch_size, n_head, sequence_length, + sequence_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + + # paddle + encoder_layer = TransformerEncoderLayer( + d_model, n_head, dim_feedforward, dropout, ffn_fc1_act, + attn_dropout, act_dropout) + + encoder_output = encoder_layer( + paddle.to_variable(src), + paddle.to_variable(src_mask)) # paddle.to_variable(src_mask)) + # 4.numpy: + # paddle self attention + self_attn = MultiHeadAttention( + d_model, n_head, dropout=attn_dropout) + attn_output = self_attn( + paddle.to_variable(src), + paddle.to_variable(src), + paddle.to_variable(src), paddle.to_variable(src_mask)).numpy() + + src = attn_output + residual + src_norm = layer_norm(src, d_model, encoder_layer.norm1) + residual = src_norm + + ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act) + src = residual + ffn_output + src = layer_norm(src, d_model, encoder_layer.norm2) + + np.testing.assert_allclose( + encoder_output.numpy(), src, rtol=1e-5, atol=1e-6) + + def test_transformer_decoder_layer(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + activation = "relu" + normalize_before = False + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( + mode="decoder_layer") + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = 
np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + for cache in [True, False]: + self_attn = MultiHeadAttention( + d_model, n_head, dropout=attn_dropout) + cross_attn = MultiHeadAttention( + d_model, n_head, dropout=attn_dropout) + + # paddle decoderlayer: + decoder_layer = TransformerDecoderLayer( + d_model, n_head, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + cache_objs = None + if cache: + cache_objs = decoder_layer.gen_cache( + paddle.to_variable(memory)) + + decoder_output = decoder_layer( + paddle.to_variable(tgt), + paddle.to_variable(memory), + paddle.to_variable(tgt_mask), + paddle.to_variable(memory_mask), cache_objs) + + decoder_output = decoder_output[0].numpy( + ) if cache else decoder_output.numpy() + + # numpy: + residual = tgt + # self-attn + self_attn_cache = cache_objs[ + 0] if cache_objs is not None else None + tgt = self_attn( + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt_mask), self_attn_cache) + + tgt = tgt[0].numpy() if cache else tgt.numpy() + + tgt = residual + tgt + # postprocess + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1) + residual = tgt_norm + # cross-attn + cross_attn_cache = cache_objs[ + 1] if cache_objs is not None else None + tgt = cross_attn( + paddle.to_variable(tgt_norm), + paddle.to_variable(memory), + paddle.to_variable(memory), + paddle.to_variable(memory_mask), cross_attn_cache) + tgt = tgt[0].numpy() if cache else tgt.numpy() + + # postprocess + tgt = tgt + residual + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2) + residual = tgt_norm + # FFN + ffn_output = ffn(tgt_norm, decoder_layer, activation) + # post process + tgt = residual + ffn_output + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3) + + np.testing.assert_allclose( + decoder_output, tgt_norm, rtol=1e-5, atol=1e-6) + + def test_encoder(self): + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer") + + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + + src_mask = np.zeros((batch_size, n_head, sequence_length, + sequence_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + with fluid.dygraph.guard(fluid.CPUPlace()): + encoder_layer = TransformerEncoderLayer(d_model, n_head, + dim_feedforward, dropout) + num_layers = 6 + encoder = TransformerEncoder(encoder_layer, num_layers) + # src, src_mask + enc_output = encoder( + paddle.to_variable(src), paddle.to_variable(src_mask)) + + def test_decoder(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + with fluid.dygraph.guard(fluid.CPUPlace()): + decoder_layer = TransformerDecoderLayer(d_model, n_head, + 
dim_feedforward, dropout) + num_layers = 6 + decoder = TransformerDecoder(decoder_layer, num_layers) + + output = decoder( + paddle.to_variable(tgt), + paddle.to_variable(memory), + paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask)) + + def test_transformer(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py new file mode 100755 index 0000000000000000000000000000000000000000..49924b44441aa9ae323f0d7921d71bf58b8c2cf2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py @@ -0,0 +1,681 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
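As a quick pointer to the API this new file covers, a minimal static-graph sketch of the 5-D trilinear call that TestTrilinearInterpAPI checks further down; the shapes and feed names here are illustrative and follow that test:

import numpy as np
import paddle.fluid as fluid
from paddle.nn.functional import interpolate

x = fluid.data(name="x", shape=[2, 3, 6, 9, 4], dtype="float32")
# resize the (D, H, W) dims of an NCDHW input to a fixed target size
out = interpolate(x, size=[12, 18, 8], mode='trilinear', data_format="NCDHW")

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
res = exe.run(feed={"x": np.random.random((2, 3, 6, 9, 4)).astype("float32")},
              fetch_list=[out])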
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.nn.functional import interpolate + + +def trilinear_interp_np(input, + out_d, + out_h, + out_w, + out_size=None, + actual_shape=None, + align_corners=True, + align_mode=0, + data_layout='NCDHW'): + """trilinear interpolation implement in shape [N, C, D, H, W]""" + if data_layout == "NDHWC": + input = np.transpose(input, (0, 4, 1, 2, 3)) # NDHWC => NCDHW + if out_size is not None: + out_d = out_size[0] + out_h = out_size[1] + out_w = out_size[2] + if actual_shape is not None: + out_d = actual_shape[0] + out_h = actual_shape[1] + out_w = actual_shape[2] + batch_size, channel, in_d, in_h, in_w = input.shape + + ratio_d = ratio_h = ratio_w = 0.0 + if out_d > 1: + if (align_corners): + ratio_d = (in_d - 1.0) / (out_d - 1.0) + else: + ratio_d = 1.0 * in_d / out_d + if out_h > 1: + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + if out_w > 1: + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((batch_size, channel, out_d, out_h, out_w)) + + for i in range(out_d): + if (align_mode == 0 and not align_corners): + d = int(ratio_d * (i + 0.5) - 0.5) + else: + d = int(ratio_d * i) + + d = max(0, d) + did = 1 if d < in_d - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_d = max(ratio_d * (i + 0.5) - 0.5, 0) + d1lambda = idx_src_d - d + else: + d1lambda = ratio_d * i - d + d2lambda = 1.0 - d1lambda + + for j in range(out_h): + if (align_mode == 0 and not align_corners): + h = int(ratio_h * (j + 0.5) - 0.5) + else: + h = int(ratio_h * j) + + h = max(0, h) + hid = 1 if h < in_h - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_h = max(ratio_h * (j + 0.5) - 0.5, 0) + h1lambda = idx_src_h - h + else: + h1lambda = ratio_h * j - h + h2lambda = 1.0 - h1lambda + + for k in range(out_w): + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (k + 0.5) - 0.5) + else: + w = int(ratio_w * k) + w = max(0, w) + wid = 1 if w < in_w - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_w = max(ratio_w * (k + 0.5) - 0.5, 0) + w1lambda = idx_src_w - w + else: + w1lambda = ratio_w * k - w + w2lambda = 1.0 - w1lambda + + out[:, :, i, j, k] = \ + d2lambda * \ + (h2lambda * (w2lambda * input[:, :, d, h, w] + \ + w1lambda * input[:, :, d, h, w+wid]) + \ + h1lambda * (w2lambda * input[:, :, d, h+hid, w] + \ + w1lambda * input[:, :, d, h+hid, w+wid])) + \ + d1lambda * \ + (h2lambda * (w2lambda * input[:, :, d+did, h, w] + \ + w1lambda * input[:, :, d+did, h, w+wid]) + \ + h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] + \ + w1lambda * input[:, :, d+did, h+hid, w+wid])) + if data_layout == "NDHWC": + out = np.transpose(out, (0, 2, 3, 4, 1)) # NCDHW => NDHWC + + return out.astype(input.dtype) + + +class TestTrilinearInterpOp(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCDHW' + self.init_test_case() + self.op_type = "trilinear_interp_v2" + input_np = np.random.random(self.input_shape).astype("float32") + + if self.data_layout == "NCDHW": + in_d = self.input_shape[2] + in_h = self.input_shape[3] + in_w = self.input_shape[4] + else: + in_d = self.input_shape[1] + in_h = self.input_shape[2] + in_w = self.input_shape[3] + + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + 
scale_d = scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_d = scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[2] + scale_h = self.scale[1] + scale_d = self.scale[0] + out_d = int(in_d * scale_d) + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + else: + out_d = self.out_d + out_h = self.out_h + out_w = self.out_w + + output_np = trilinear_interp_np( + input_np, out_d, out_h, out_w, self.out_size, self.actual_shape, + self.align_corners, self.align_mode, self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + # c++ end treat NCDHW the same way as NCHW + if self.data_layout == 'NCDHW': + data_layout = 'NCHW' + else: + data_layout = 'NHWC' + self.attrs = { + 'out_d': self.out_d, + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode, + 'data_layout': data_layout + } + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 4, 4, 4] + self.out_d = 2 + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase1(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 1, 7, 8, 9] + self.out_d = 1 + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase2(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 9, 6, 8] + self.out_d = 12 + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase3(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [3, 2, 16, 8, 4] + self.out_d = 32 + self.out_h = 16 + self.out_w = 8 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase4(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [4, 1, 7, 8, 9] + self.out_d = 1 + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.out_size = np.array([2, 2, 2]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase5(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [3, 3, 9, 6, 8] + self.out_d = 12 + self.out_h = 12 + self.out_w = 12 + self.scale = 0. 
+ self.out_size = np.array([11, 11, 11]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase6(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [1, 1, 16, 8, 4] + self.out_d = 8 + self.out_h = 32 + self.out_w = 16 + self.scale = 0. + self.out_size = np.array([17, 9, 5]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpSame(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [1, 1, 16, 8, 4] + self.out_d = 16 + self.out_h = 8 + self.out_w = 4 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpSameHW(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [1, 1, 16, 8, 4] + self.out_d = 8 + self.out_h = 8 + self.out_w = 4 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpActualShape(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [3, 2, 16, 8, 4] + self.out_d = 64 + self.out_h = 32 + self.out_w = 16 + self.scale = 0. + self.out_size = np.array([33, 19, 7]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpDatalayout(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 4, 4, 4, 3] + self.out_d = 2 + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + self.data_layout = "NDHWC" + + +class TestTrilinearInterpOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "trilinear_interp_v2" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + scale_d = scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_d = scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[2] + scale_h = self.scale[1] + scale_d = self.scale[0] + out_d = int(self.input_shape[2] * scale_d) + out_h = int(self.input_shape[3] * scale_h) + out_w = int(self.input_shape[4] * scale_w) + else: + out_d = self.out_d + out_h = self.out_h + out_w = self.out_w + + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + + self.attrs = { + 'out_d': self.out_d, + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode + } + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(place=core.CPUPlace(), atol=1) + + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [1, 3, 9, 6, 8] + self.out_d = 13 + 
self.out_h = 10 + self.out_w = 9 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 16, 8, 4] + self.out_d = 13 + self.out_h = 7 + self.out_w = 2 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [4, 1, 7, 8, 9] + self.out_d = 3 + self.out_h = 5 + self.out_w = 13 + self.scale = 0. + self.out_size = np.array([6, 15, 21]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + self.align_mode = 1 + + +class TestTrilinearInterpWithMethod2(TestTrilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + self.align_mode = 0 + + +class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp): + def set_align_mode(self): + self.align_corners = True + self.align_mode = 0 + + +class TestTrilinearInterpScale1(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 5, 7, 9] + self.out_d = 82 + self.out_h = 60 + self.out_w = 25 + self.scale = 2. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpScale2(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 5, 7, 9] + self.out_d = 60 + self.out_h = 40 + self.out_w = 25 + self.scale = 1. + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpScale3(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 5, 7, 9] + self.out_d = 60 + self.out_h = 40 + self.out_w = 25 + self.scale = 1.5 + self.align_corners = True + self.align_mode = 1 + + +class TestTrilinearInterpZero(TestTrilinearInterpOp): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 5, 7, 11] + self.out_d = 60 + self.out_h = 40 + self.out_w = 25 + self.scale = 0.2 + self.align_corners = False + self.align_mode = 0 + + +class TestTrilinearInterpOp_attr_tensor(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "trilinear_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + self.attrs = { + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode + } + + input_np = np.random.random(self.input_shape).astype("float32") + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float32") + elif self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + scale_d = scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_d = scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[2] + scale_h = self.scale[1] + scale_d = self.scale[0] + out_d = int(self.input_shape[2] * scale_d) + out_h = int(self.input_shape[3] * scale_h) + out_w = int(self.input_shape[4] * scale_w) + else: + out_d = self.out_d + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + 
elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_d'] = self.out_d + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale > 0: + if isinstance(self.scale, float) or isinstance(self.scale, int): + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 4, 4, 4] + self.out_d = 2 + self.out_h = 3 + self.out_w = 3 + self.scale = 0. + self.out_size = [2, 3, 3] + self.align_corners = True + self.align_mode = 1 + + +# out_size is a 1-D tensor +class TestTrilinearInterp_attr_tensor_Case1(TestTrilinearInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [3, 2, 9, 6, 8] + self.out_d = 32 + self.out_h = 16 + self.out_w = 8 + self.scale = 0.3 + self.out_size = [12, 4, 4] + self.align_corners = True + self.align_mode = 1 + + +# scale is a 1-D tensor +class TestTrilinearInterp_attr_tensor_Case2(TestTrilinearInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 8, 8, 4] + self.out_d = 16 + self.out_h = 12 + self.out_w = 4 + self.scale = 0. 
+ self.out_size = [16, 4, 10] + self.align_corners = True + self.align_mode = 1 + self.shape_by_1Dtensor = True + + +# scale is a 1-D tensor +class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'trilinear' + self.input_shape = [2, 3, 8, 8, 4] + self.out_d = 16 + self.out_h = 16 + self.out_w = 8 + self.scale = 2.0 + self.out_size = None + self.align_corners = True + self.align_mode = 1 + self.scale_by_1Dtensor = True + + +class TestTrilinearInterpAPI(unittest.TestCase): + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 9, 4], dtype="float32") + y = fluid.data(name="y", shape=[2, 6, 9, 4, 3], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[3], dtype="int32") + scale_tensor = fluid.data( + name="scale_tensor", shape=[1], dtype="float32") + + out1 = fluid.layers.resize_trilinear( + y, out_shape=[12, 18, 8], data_format='NDHWC') + out2 = fluid.layers.resize_trilinear(x, out_shape=[12, dim, 8]) + out3 = fluid.layers.resize_trilinear(x, out_shape=shape_tensor) + out4 = fluid.layers.resize_trilinear( + x, out_shape=[4, 4, 8], actual_shape=actual_size) + out5 = fluid.layers.resize_trilinear(x, scale=scale_tensor) + out6 = interpolate( + x, scale_factor=scale_tensor, mode='trilinear', data_format="NCDHW") + out7 = interpolate( + x, size=[4, 4, 8], mode='trilinear', data_format="NCDHW") + out8 = interpolate( + x, size=shape_tensor, mode='trilinear', data_format="NCDHW") + + x_data = np.random.random((2, 3, 6, 9, 4)).astype("float32") + dim_data = np.array([18]).astype("int32") + shape_data = np.array([12, 18, 8]).astype("int32") + actual_size_data = np.array([12, 18, 8]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "y": np.transpose(x_data, (0, 2, 3, 4, 1)), + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = trilinear_interp_np( + x_data, out_d=12, out_h=18, out_w=8, align_mode=1) + self.assertTrue( + np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 4, 1)))) + for i in range(len(results) - 1): + self.assertTrue(np.allclose(results[i + 1], expect_res)) + + +class TestTrilinearInterpOpException(unittest.TestCase): + def test_exception(self): + input = fluid.data(name="input", shape=[2, 3, 6, 9, 4], dtype="float32") + + def attr_data_format(): + # for 5-D input, data_format only can be NCDHW or NDHWC + out = fluid.layers.resize_trilinear( + input, out_shape=[4, 8, 4], data_format='NHWC') + + self.assertRaises(ValueError, attr_data_format) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 9a64dd1deea93f473d73d485ec5a9d707aaa54f9..a04aaaef0d41b9f991889586b489269b6ede5b42 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -14,10 +14,14 @@ from __future__ import print_function +import sys +import 
subprocess import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid.core as core +import paddle from paddle.fluid.op import Operator import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -231,16 +235,16 @@ class TestUniformRandomOpSelectedRows(unittest.TestCase): def check_with_place(self, place): scope = core.Scope() out = scope.var("X").get_selected_rows() - + paddle.manual_seed(10) op = Operator( "uniform_random", Out="X", - shape=[4, 784], + shape=[100, 784], min=-5.0, max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -252,19 +256,19 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( def check_with_place(self, place): scope = core.Scope() out = scope.var("X").get_selected_rows() - + paddle.manual_seed(10) op = Operator( "uniform_random", Out="X", - shape=[4, 784], + shape=[100, 784], min=-5.0, max=10.0, seed=10, - diag_num=4, + diag_num=100, diag_step=784, diag_val=1.0) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist_diag(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -273,6 +277,7 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( class TestUniformRandomOpApi(unittest.TestCase): def test_api(self): + paddle.manual_seed(10) x = fluid.layers.data('x', shape=[16], dtype='float32', lod_level=1) y = fluid.layers.fc(x, size=16, @@ -344,12 +349,15 @@ class TestUniformRandomOp_attr_tensor_API(unittest.TestCase): class TestUniformRandomOp_API_seed(unittest.TestCase): def test_attr_tensor_API(self): + _seed = 10 + gen = paddle.manual_seed(_seed) + gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): _min = 5 _max = 10 - _seed = 10 + ret = fluid.layers.nn.uniform_random( [2, 3, 2], min=_min, max=_max, seed=_seed) ret_2 = fluid.layers.nn.uniform_random( @@ -383,8 +391,8 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_tensor = scope.var("Shape").get_tensor() - shape_tensor.set(np.array([4, 784]).astype("int64"), place) - + shape_tensor.set(np.array([100, 784]).astype("int64"), place) + paddle.manual_seed(10) op = Operator( "uniform_random", ShapeTensor="Shape", @@ -393,7 +401,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -415,10 +423,10 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_1 = scope.var("shape1").get_tensor() - shape_1.set(np.array([4]).astype("int64"), place) + shape_1.set(np.array([100]).astype("int64"), place) shape_2 = scope.var("shape2").get_tensor() shape_2.set(np.array([784]).astype("int64"), place) - + paddle.manual_seed(10) op = Operator( "uniform_random", ShapeTensorList=["shape1", "shape2"], @@ -427,7 +435,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - 
self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -452,25 +460,81 @@ class TestUniformRandomBatchSizeLikeOpError(unittest.TestCase): def test_Variable(): x1 = fluid.create_lod_tensor( - np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace()) + np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace()) fluid.layers.uniform_random_batch_size_like(x1) self.assertRaises(TypeError, test_Variable) def test_shape(): x1 = fluid.layers.data( - name='x2', shape=[4, 784], dtype='float32') + name='x2', shape=[100, 784], dtype='float32') fluid.layers.uniform_random_batch_size_like(x1, shape="shape") self.assertRaises(TypeError, test_shape) def test_dtype(): x2 = fluid.layers.data( - name='x2', shape=[4, 784], dtype='float32') + name='x2', shape=[100, 784], dtype='float32') fluid.layers.uniform_random_batch_size_like(x2, 'int32') self.assertRaises(TypeError, test_dtype) +class TestUniformAlias(unittest.TestCase): + def test_alias(self): + paddle.uniform([2, 3], min=-5.0, max=5.0) + paddle.tensor.uniform([2, 3], min=-5.0, max=5.0) + paddle.tensor.random.uniform([2, 3], min=-5.0, max=5.0) + + def test_uniform_random(): + paddle.tensor.random.uniform_random([2, 3], min=-5.0, max=5.0) + + self.assertRaises(AttributeError, test_uniform_random) + + +class TestUniformOpError(unittest.TestCase): + def test_errors(self): + main_prog = Program() + start_prog = Program() + with program_guard(main_prog, start_prog): + + def test_Variable(): + x1 = fluid.create_lod_tensor( + np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace()) + paddle.tensor.random.uniform(x1) + + self.assertRaises(TypeError, test_Variable) + + def test_Variable2(): + x1 = np.zeros((100, 784)) + paddle.tensor.random.uniform(x1) + + self.assertRaises(TypeError, test_Variable2) + + def test_dtype(): + x2 = fluid.layers.data( + name='x2', shape=[100, 784], dtype='float32') + paddle.tensor.random.uniform(x2, 'int32') + + self.assertRaises(TypeError, test_dtype) + + def test_out_dtype(): + out = paddle.tensor.random.uniform( + shape=[3, 4], dtype='float64') + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_out_dtype() + + +class TestUniformDygraphMode(unittest.TestCase): + def test_check_output(self): + with fluid.dygraph.guard(): + x = paddle.tensor.random.uniform( + [10], dtype="float32", min=0.0, max=1.0) + x_np = x.numpy() + for i in range(10): + self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py index 65194524adfcd7f48efe0f2378c95dba7a8c4a5e..a2c60d870e5e13fd161945fe0abe9b3ab82cc82c 100644 --- a/python/paddle/fluid/tests/unittests/test_unique.py +++ b/python/paddle/fluid/tests/unittests/test_unique.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.op import Operator @@ -125,5 +126,185 @@ class TestRandomGPU(TestUniqueOp): self.check_output_with_place(place, atol=1e-5) +class TestSortedUniqueOp(TestUniqueOp): + def init_config(self): + self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64')} + unique, indices, inverse, count = np.unique( + self.inputs['X'], + return_index=True, + return_inverse=True, + return_counts=True, + axis=None) + self.attrs = 
{ + 'dtype': int(core.VarDesc.VarType.INT32), + "return_index": True, + "return_inverse": True, + "return_counts": True, + "axis": None, + "is_sorted": True + } + self.outputs = { + 'Out': unique, + 'Indices': indices, + "Index": inverse, + "Counts": count, + } + + +class TestUniqueOpAxisNone(TestUniqueOp): + def init_config(self): + self.inputs = {'X': np.random.random((4, 7, 10)).astype('float64')} + unique, indices, inverse, counts = np.unique( + self.inputs['X'], + return_index=True, + return_inverse=True, + return_counts=True, + axis=None) + self.attrs = { + 'dtype': int(core.VarDesc.VarType.INT32), + "return_index": True, + "return_inverse": True, + "return_counts": True, + "axis": None, + "is_sorted": True + } + self.outputs = { + 'Out': unique, + 'Indices': indices, + "Index": inverse, + "Counts": counts, + } + + +class TestUniqueOpAxis1(TestUniqueOp): + def init_config(self): + self.inputs = {'X': np.random.random((3, 8, 8)).astype('float64')} + unique, indices, inverse, counts = np.unique( + self.inputs['X'], + return_index=True, + return_inverse=True, + return_counts=True, + axis=1) + self.attrs = { + 'dtype': int(core.VarDesc.VarType.INT32), + "return_index": True, + "return_inverse": True, + "return_counts": True, + "axis": [1], + "is_sorted": True + } + self.outputs = { + 'Out': unique, + 'Indices': indices, + "Index": inverse, + "Counts": counts, + } + + +class TestUniqueAPI(unittest.TestCase): + def test_dygraph_api_out(self): + paddle.disable_static() + x_data = x_data = np.random.randint(0, 10, (120)) + x = paddle.to_tensor(x_data) + out = paddle.unique(x) + expected_out = np.unique(x_data) + self.assertTrue((out.numpy() == expected_out).all(), True) + paddle.enable_static() + + def test_dygraph_api_attr(self): + paddle.disable_static() + x_data = np.random.random((3, 5, 5)).astype("float32") + x = paddle.to_tensor(x_data) + out, index, inverse, counts = paddle.unique( + x, + return_index=True, + return_inverse=True, + return_counts=True, + axis=0) + np_out, np_index, np_inverse, np_counts = np.unique( + x_data, + return_index=True, + return_inverse=True, + return_counts=True, + axis=0) + self.assertTrue((out.numpy() == np_out).all(), True) + self.assertTrue((index.numpy() == np_index).all(), True) + self.assertTrue((inverse.numpy() == np_inverse).all(), True) + self.assertTrue((counts.numpy() == np_counts).all(), True) + paddle.enable_static() + + def test_dygraph_attr_dtype(self): + paddle.disable_static() + x_data = x_data = np.random.randint(0, 10, (120)) + x = paddle.to_tensor(x_data) + out, indices, inverse, counts = paddle.unique( + x, + return_index=True, + return_inverse=True, + return_counts=True, + dtype="int32") + expected_out, np_indices, np_inverse, np_counts = np.unique( + x_data, return_index=True, return_inverse=True, return_counts=True) + self.assertTrue((out.numpy() == expected_out).all(), True) + self.assertTrue((indices.numpy() == np_indices).all(), True) + self.assertTrue((inverse.numpy() == np_inverse).all(), True) + self.assertTrue((counts.numpy() == np_counts).all(), True) + paddle.enable_static() + + def test_static_graph(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.data(name='x', shape=[3, 2], dtype='float64') + unique, inverse, counts = paddle.unique( + x, return_inverse=True, return_counts=True, axis=0) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + x_np = np.array([[1, 2], [3, 4], [1, 2]]).astype('float64') + result = exe.run(feed={"x": x_np}, + 
fetch_list=[unique, inverse, counts]) + np_unique, np_inverse, np_counts = np.unique( + x_np, return_inverse=True, return_counts=True, axis=0) + self.assertTrue(np.allclose(result[0], np_unique)) + self.assertTrue(np.allclose(result[1], np_inverse)) + self.assertTrue(np.allclose(result[2], np_counts)) + + +class TestUniqueError(unittest.TestCase): + def test_input_dtype(self): + def test_x_dtype(): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.data(name='x', shape=[10, 10], dtype='float16') + result = paddle.unique(x) + + self.assertRaises(TypeError, test_x_dtype) + + def test_attr(self): + x = paddle.data(name='x', shape=[10, 10], dtype='float64') + + def test_return_index(): + result = paddle.unique(x, return_index=0) + + self.assertRaises(TypeError, test_return_index) + + def test_return_inverse(): + result = paddle.unique(x, return_inverse='s') + + self.assertRaises(TypeError, test_return_inverse) + + def test_return_counts(): + result = paddle.unique(x, return_counts=3) + + self.assertRaises(TypeError, test_return_counts) + + def test_axis(): + result = paddle.unique(x, axis='12') + + def test_dtype(): + result = paddle.unique(x, dtype='float64') + + self.assertRaises(TypeError, test_axis) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 80b94704c388824901312b5d577cb5cfd0d0c75b..c8383bb950d3ed7b2cdfafa185b0ad156bf7c7bf 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -32,6 +32,30 @@ class TestVarBase(unittest.TestCase): def test_to_tensor(self): def _test_place(place): with fluid.dygraph.guard(): + paddle.set_default_dtype('float32') + x = paddle.to_tensor(1, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1])) + self.assertNotEqual(x.dtype, core.VarDesc.VarType.FP32) + + x = paddle.to_tensor(1.2, place=place, stop_gradient=False) + self.assertTrue( + np.array_equal(x.numpy(), np.array([1.2]).astype( + 'float32'))) + self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) + + x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) + self.assertEqual(x.dtype, 'complex64') + + paddle.set_default_dtype('float64') + x = paddle.to_tensor(1.2, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1.2])) + self.assertEqual(x.dtype, core.VarDesc.VarType.FP64) + + x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) + self.assertEqual(x.dtype, 'complex128') + x = paddle.to_tensor( 1, dtype='float32', place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1.])) diff --git a/python/paddle/fluid/tests/unittests/test_variance_layer.py b/python/paddle/fluid/tests/unittests/test_variance_layer.py index 569f064db8549b5f28bc751a36cbe4b379636379..b5bb3cc978a558bb52f5f56c58f107b653956a75 100644 --- a/python/paddle/fluid/tests/unittests/test_variance_layer.py +++ b/python/paddle/fluid/tests/unittests/test_variance_layer.py @@ -15,65 +15,104 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -class TestVarianceLayer(unittest.TestCase): +def ref_var(x, axis=None, unbiased=True, keepdim=False): + ddof = 1 if unbiased else 0 + if isinstance(axis, int): + axis = (axis, ) + if axis is not None: + axis = tuple(axis) + return 
np.var(x, axis=axis, ddof=ddof, keepdims=keepdim) + + +class TestVarAPI(unittest.TestCase): def setUp(self): - self._dtype = "float64" - self._input = np.random.random([2, 3, 4, 5]).astype(self._dtype) - - def static(self, axis=None, keepdim=False, unbiased=True): - prog = fluid.Program() - with fluid.program_guard(prog): - data = fluid.data( - name="data", dtype=self._dtype, shape=[None, 3, 4, 5]) - out = prog.current_block().create_var( - dtype=self._dtype, shape=[2, 3, 4, 5]) - paddle.var(input=data, - axis=axis, - keepdim=keepdim, - unbiased=unbiased, - out=out) - - exe = fluid.Executor(self._place) - return exe.run(feed={"data": self._input}, - program=prog, - fetch_list=[out])[0] - - def dynamic(self, axis=None, keepdim=False, unbiased=True): - with fluid.dygraph.guard(self._place): - data = fluid.dygraph.to_variable(self._input) - out = paddle.var(input=data, - axis=axis, - keepdim=keepdim, - unbiased=unbiased) - return out.numpy() - - def numpy(self, axis=None, keepdim=False, unbiased=True): - ddof = 1 if unbiased else 0 - axis = tuple(axis) if isinstance(axis, list) else axis - return np.var(self._input, axis=axis, keepdims=keepdim, ddof=ddof) - - def test_equal(self): - places = [fluid.CPUPlace()] - if fluid.core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for place in places: - self._place = place - self.assertTrue(np.allclose(self.numpy(), self.static())) - self.assertTrue( - np.allclose( - self.numpy(axis=[0, 2]), self.dynamic(axis=[0, 2]))) - self.assertTrue( - np.allclose( - self.numpy( - axis=[1, 3], keepdim=True), - self.dynamic( - axis=[1, 3], keepdim=True))) - self.assertTrue( - np.allclose( - self.numpy(unbiased=False), self.dynamic(unbiased=False))) + self.dtype = 'float64' + self.shape = [1, 3, 4, 10] + self.axis = [1, 3] + self.keepdim = False + self.unbiased = True + self.set_attrs() + self.x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + self.place=paddle.CUDAPlace(0) \ + if paddle.fluid.core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def set_attrs(self): + pass + + def static(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.shape, self.dtype) + out = paddle.var(x, self.axis, self.unbiased, self.keepdim) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, fetch_list=[out]) + return res[0] + + def dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x) + out = paddle.var(x, self.axis, self.unbiased, self.keepdim) + paddle.enable_static() + return out.numpy() + + def test_api(self): + out_ref = ref_var(self.x, self.axis, self.unbiased, self.keepdim) + out_dygraph = self.dygraph() + out_static = self.static() + for out in [out_dygraph, out_static]: + self.assertTrue(np.allclose(out_ref, out)) + self.assertTrue(np.equal(out_ref.shape, out.shape).all()) + + +class TestVarAPI_dtype(TestVarAPI): + def set_attrs(self): + self.dtype = 'float32' + + +class TestVarAPI_axis_int(TestVarAPI): + def set_attrs(self): + self.axis = 2 + + +class TestVarAPI_axis_list(TestVarAPI): + def set_attrs(self): + self.axis = [1, 2] + + +class TestVarAPI_axis_tuple(TestVarAPI): + def set_attrs(self): + self.axis = (1, 3) + + +class TestVarAPI_keepdim(TestVarAPI): + def set_attrs(self): + self.keepdim = False + + +class TestVarAPI_unbiased(TestVarAPI): + def set_attrs(self): + self.unbiased = False + + +class TestVarAPI_alias(unittest.TestCase): + def test_alias(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([10, 12], 'float32')) + out1 = 
paddle.var(x).numpy() + out2 = paddle.tensor.var(x).numpy() + out3 = paddle.tensor.stat.var(x).numpy() + self.assertTrue(np.allclose(out1, out2)) + self.assertTrue(np.allclose(out1, out3)) + paddle.enable_static() + + +class TestVarError(unittest.TestCase): + def test_error(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [2, 3, 4], 'int32') + self.assertRaises(TypeError, paddle.var, x) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index 449ac959188949559056654418ace3e227c368da..6bc42f0712a1a8c9f9a0640e06042c42e7cc948f 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -21,25 +21,25 @@ from op_test import OpTest from test_softmax_op import stable_softmax import paddle.fluid as fluid from paddle.fluid import Program, program_guard +import paddle +import paddle.nn.functional as F CUDA_BLOCK_SIZE = 512 class CTCForward(object): - def __init__(self, softmax, softmax_lod, labels, labels_lod, blank, - norm_by_times): + def __init__(self, softmax, softmax_lod, labels, labels_lod, num_classes, + batch_size, blank, norm_by_times): self.softmax = softmax self.softmax_lod = softmax_lod - assert labels.shape[1] == 1 self.labels = labels self.labels_lod = labels_lod self.blank = blank self.norm_by_times = norm_by_times self.level = 0 - self.num_classes = softmax.shape[1] - self.batch_size = len(softmax_lod[self.level]) - assert self.batch_size == len(labels_lod[self.level]) + self.num_classes = num_classes + self.batch_size = batch_size self.loss = np.zeros([self.batch_size, 1], dtype="float32") self.gradient = np.zeros(self.softmax.shape, dtype="float32") @@ -163,17 +163,25 @@ class CTCForward(object): softmax_offset = 0 labels_offset = 0 for i in range(self.batch_size): - softmax_start_i = softmax_offset - softmax_end_i = softmax_offset + self.softmax_lod[self.level][i] - labels_start_i = labels_offset - labels_end_i = labels_offset + self.labels_lod[self.level][i] - - softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :] - labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] - self.loss[i] = self.forward_a_sequence(softmax_a_sequence, - labels_a_sequence) - softmax_offset += self.softmax_lod[self.level][i] - labels_offset += self.labels_lod[self.level][i] + if self.labels.shape[1] == 1: + softmax_start_i = softmax_offset + softmax_end_i = softmax_offset + self.softmax_lod[self.level][i] + labels_start_i = labels_offset + labels_end_i = labels_offset + self.labels_lod[self.level][i] + + softmax_a_sequence = self.softmax[softmax_start_i: + softmax_end_i, :] + labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] + self.loss[i] = self.forward_a_sequence(softmax_a_sequence, + labels_a_sequence) + softmax_offset += self.softmax_lod[self.level][i] + labels_offset += self.labels_lod[self.level][i] + else: + softmax_a_sequence = self.softmax[:self.softmax_lod[i], i, :] + labels_a_sequence = self.labels[:self.labels_lod[i], :] + self.loss[i] = self.forward_a_sequence(softmax_a_sequence, + labels_a_sequence) + return self.loss @@ -201,7 +209,8 @@ class TestWarpCTCOp(OpTest): dtype="int32") ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, - self.blank, self.norm_by_times) + self.num_classes, self.batch_size, self.blank, + self.norm_by_times) loss = ctc.forward() max_sequence_length = 0 @@ -223,7 +232,7 @@ class 
TestWarpCTCOp(OpTest): } def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() def test_check_grad(self): self.outputs['WarpCTCGrad'] = self.gradient @@ -237,7 +246,7 @@ class TestWarpCTCOpCase1(TestWarpCTCOp): self.num_classes = CUDA_BLOCK_SIZE + 2 self.logits_lod = [[4, 1, 3, 3]] self.labels_lod = [[3, 1, 4, 4]] - self.blank = 0 + self.blank = self.num_classes - 1 self.norm_by_times = False @@ -267,7 +276,8 @@ class TestWarpCTCOpWithPadding(OpTest): dtype="int32") ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, - self.blank, self.norm_by_times) + self.num_classes, self.batch_size, self.blank, + self.norm_by_times) loss = ctc.forward() max_sequence_length = 0 @@ -317,7 +327,7 @@ class TestWarpCTCOpWithPadding(OpTest): } def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() def test_check_grad(self): self.outputs['WarpCTCGrad'] = self.gradient @@ -333,7 +343,7 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding): self.labels_lod = [[3, 1, 4, 4]] self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64) self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64) - self.blank = 0 + self.blank = self.num_classes - 1 self.norm_by_times = False @@ -389,5 +399,97 @@ class TestWarpCTCOpError(unittest.TestCase): self.assertRaises(TypeError, test_label_len_Variable) +class TestCTCLossAPICase(unittest.TestCase): + def test_functinal_api(self): + self.batch_size = 4 + self.num_classes = CUDA_BLOCK_SIZE + 2 + self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + + logits = np.random.uniform(0.1, 1.0, [ + max(self.logits_length), self.batch_size, self.num_classes + ]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, -1, logits) + # labels should not be blank + labels = np.random.randint( + 0, + self.num_classes - 1, [self.batch_size, max(self.labels_length)], + dtype="int32") + + ctc = CTCForward(softmax, self.logits_length, labels, + self.labels_length, self.num_classes, self.batch_size, + self.blank, self.norm_by_times) + loss_np = ctc.forward() + + paddle.disable_static() + softmax = paddle.to_variable(logits) + labels = paddle.to_variable(labels) + logits_length = paddle.to_variable(self.logits_length) + labels_length = paddle.to_variable(self.labels_length) + loss_pd_mean = F.ctc_loss( + softmax, + labels, + logits_length, + labels_length, + blank=self.blank, + reduction='mean') + loss_pd_mean = loss_pd_mean.numpy() + + loss_pd_sum = F.ctc_loss( + softmax, + labels, + logits_length, + labels_length, + blank=self.blank, + reduction='sum') + loss_pd_sum = loss_pd_sum.numpy() + paddle.enable_static() + loss_np = np.squeeze(loss_np, axis=-1) + loss_np_mean = (loss_np / labels_length.numpy()).mean() + loss_np_sum = loss_np.sum() + + self.assertTrue(np.allclose(loss_pd_mean, loss_np_mean, atol=1)) + self.assertTrue(np.allclose(loss_pd_sum, loss_np_sum, atol=1)) + + def test_class_api(self): + self.batch_size = 3 + self.num_classes = 15 + self.logits_length = np.array([3, 3, 3], dtype=np.int64) + self.labels_length = np.array([0, 1, 2], dtype=np.int64) + self.blank = 0 + self.norm_by_times = False + + logits = np.random.uniform(0.1, 1.0, [ + max(self.logits_length), self.batch_size, self.num_classes + ]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, -1, logits) + # labels should not be blank + labels = np.random.randint( + 1, + 
self.num_classes, [self.batch_size, max(self.labels_length)], + dtype="int32") + + ctc = CTCForward(softmax, self.logits_length, labels, + self.labels_length, self.num_classes, self.batch_size, + self.blank, self.norm_by_times) + loss_np = ctc.forward() + + paddle.disable_static() + softmax = paddle.to_variable(logits) + labels = paddle.to_variable(labels) + logits_length = paddle.to_variable(self.logits_length) + labels_length = paddle.to_variable(self.labels_length) + + loss_pd = paddle.nn.CTCLoss(self.blank, 'none')( + softmax, labels, logits_length, labels_length) + loss_pd = loss_pd.numpy() + paddle.enable_static() + loss_np = np.squeeze(loss_np, axis=-1) + + self.assertTrue(np.allclose(loss_pd, loss_np, atol=1)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index 207ff66a0f877598989e47a8632aa783b53bcc67..ee01bfb21f8206133aa55e8962b4bcc46233085e 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.layers as layers from paddle.fluid.executor import Executor import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py index 4629089e39c9489725340df2172c53ed0661708f..581656f6cd421b12cb4c373bd6d46648704f0c1a 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py @@ -73,6 +73,7 @@ NO_FP64_CHECK_GRAD_OP_LIST = [ 'mish', \ 'transpose2', \ 'trilinear_interp', \ + 'trilinear_interp_v2', \ 'var_conv_2d', \ 'warpctc', \ 'bilateral_slice' diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index 5300ab935a3405f9f76c08a7f2ece8bad33367ac..47d62999c92d12ab4305272f60c1453cda211b09 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -15,6 +15,7 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [ 'affine_channel', \ 'bilinear_interp', \ + 'bilinear_interp_v2',\ 'bilinear_tensor_product', \ 'conv2d', \ 'conv3d', \ @@ -45,4 +46,6 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [ 'cudnn_lstm' ] -NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp'] +NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\ + 'bilinear_interp_v2' + ] diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f01dc01973a603a0b6ea08358f73237c68924c78..b2975283fbef010029b935b9b209411f09bdb5fd 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ __all__ = [ ] __all__ += [ - 'BackwardStrategy', 'grad', 'LayerList', 'load', 'save', 'prepare_context', - 'to_variable', 'no_grad', 'ParallelEnv', 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'prepare_context', 'to_variable', + 'no_grad', 'ParallelEnv', 'DataParallel' ] __all__ += [ @@ -50,8 +50,6 @@ from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from 
..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS -from ..fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS -from ..fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS @@ -61,5 +59,3 @@ from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_AL from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS - -BackwardStrategy = core.BackwardStrategy diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index 4d5b2c8e6fcb13826d9e1a0d6738c351be7cf0b1..41ec18ce32d3036c3db86aaa98053f59ff61f717 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -15,6 +15,7 @@ # TODO: define framework api from paddle.fluid.layer_helper_base import LayerHelperBase from paddle.fluid.data_feeder import convert_dtype +import numpy as np __all__ = ['set_default_dtype', 'get_default_dtype'] @@ -24,7 +25,8 @@ def set_default_dtype(d): Set default dtype. The default dtype is initially float32 Args: - d(string|np.dtype): the dtype to make the default + d(string|np.dtype): the dtype to make the default. It only + supports float16, float32 and float64. Returns: None. @@ -36,13 +38,35 @@ def set_default_dtype(d): paddle.set_default_dtype("float32") """ - d = convert_dtype(d) + if isinstance(d, type): + if d in [np.float16, np.float32, np.float64]: + d = d.__name__ + else: + raise TypeError( + "set_default_dtype only supports [float16, float32, float64] " + ", but received %s" % d.__name__) + else: + if d in [ + 'float16', 'float32', 'float64', u'float16', u'float32', + u'float64' + ]: + # this code is a little bit dangerous, since error could happen + # when casting no-ascii code to str in python2. + # but since the set itself is limited, so currently, it is good. + # however, jointly supporting python2 and python3, (as well as python4 maybe) + # may still be a long-lasting problem. + d = str(d) + else: + raise TypeError( + "set_default_dtype only supports [float16, float32, float64] " + ", but received %s" % str(d)) + LayerHelperBase.set_default_dtype(d) def get_default_dtype(): """ - Get the current default dtype. The default dtype is initially float32 + Get the current default dtype. The default dtype is initially float32. Args: None. diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 1bb13294805efca319546b432354ce5ba6b1e868..2555d24464112ed8446d863dc8e65cfa37680b36 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -14,28 +14,50 @@ # TODO: define random api import paddle.fluid as fluid +from paddle.fluid import core __all__ = ['manual_seed'] def manual_seed(seed): """ - :alias_main: paddle.manual_seed - :alias: paddle.manual_seed,paddle.framework.random.manual_seed - Set global manual seed for program + Sets the seed for global default generator, which manages the random number generation. Args: - manual_seed(int): random seed for program + seed(int): The random seed to set. It is recommend to set a large int number. Returns: - None. + Generator: The global default generator object. Examples: .. 
code-block:: python - from paddle.framework import manual_seed - manual_seed(102) + import paddle + gen = paddle.manual_seed(102) + + """ + #TODO(zhiqiu): 1. remove program.random_seed when all random-related op upgrade + # 2. support gpu generator by global device + + seed = int(seed) + + core.default_cpu_generator()._is_init_py = True + return core.default_cpu_generator().manual_seed(seed) + + +def _manual_program_seed(seed): + """ + Sets global seed for generating random numbers. + + NOTE(zhiqiu): This is the original implemention of manual_seed. Keeps it temporally + since CUDA generator is not developed, so we need it in the unittest. + + Args: + seed(int): The random seed to set. It is recommend to set a large int number. + + Returns: + None """ fluid.default_main_program().random_seed = seed fluid.default_startup_program().random_seed = seed diff --git a/python/paddle/incubate/complex/tensor/math.py b/python/paddle/incubate/complex/tensor/math.py index 231cbd918281216eae5112c42544c8dfececd9d4..465e4887a1f8a8dc1d53afac8869f0b55776f3d2 100644 --- a/python/paddle/incubate/complex/tensor/math.py +++ b/python/paddle/incubate/complex/tensor/math.py @@ -330,8 +330,8 @@ def sum(input, dim=None, keep_dim=False, name=None): """ complex_variable_exists([input], "sum") - real = math.sum(input.real, dim=dim, keep_dim=keep_dim, name=name) - imag = math.sum(input.imag, dim=dim, keep_dim=keep_dim, name=name) + real = math.sum(input.real, axis=dim, keepdim=keep_dim, name=name) + imag = math.sum(input.imag, axis=dim, keepdim=keep_dim, name=name) return ComplexVariable(real, imag) diff --git a/python/paddle/incubate/hapi/__init__.py b/python/paddle/incubate/hapi/__init__.py index a6b5faef57ca95188f0759f53753177e4f5946f3..c0361fa33246ff3315a107c520972ca6bebc8168 100644 --- a/python/paddle/incubate/hapi/__init__.py +++ b/python/paddle/incubate/hapi/__init__.py @@ -20,7 +20,6 @@ from . import download from . import model from .model import * -from . import metrics from . import datasets from . import distributed from . 
import vision @@ -39,7 +38,6 @@ __all__ = [ 'datasets', 'distributed', 'download', - 'metrics', 'vision', 'text', 'utils', diff --git a/python/paddle/incubate/hapi/callbacks.py b/python/paddle/incubate/hapi/callbacks.py index 741552511f9fdc93d9e370fc7d45f9d84a1d4392..0804708210a9749813e195a8b5579b339986acd6 100644 --- a/python/paddle/incubate/hapi/callbacks.py +++ b/python/paddle/incubate/hapi/callbacks.py @@ -295,8 +295,8 @@ class ProgBarLogger(Callback): import paddle.fluid as fluid import paddle.incubate.hapi as hapi - inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')] - labels = [hapi.Input('label', [None, 1], 'int64')] + inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')] + labels = [hapi.Input([None, 1], 'int64', 'label')] train_dataset = hapi.datasets.MNIST(mode='train') @@ -305,8 +305,8 @@ class ProgBarLogger(Callback): optim = fluid.optimizer.Adam(0.001) model.prepare(optimizer=optim, - loss_function=paddle.nn.CrossEntropyLoss(), - metrics=hapi.metrics.Accuracy()) + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy()) callback = hapi.callbacks.ProgBarLogger(log_freq=10) model.fit(train_dataset, batch_size=64, callbacks=callback) @@ -431,8 +431,8 @@ class ModelCheckpoint(Callback): import paddle.fluid as fluid import paddle.incubate.hapi as hapi - inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')] - labels = [hapi.Input('label', [None, 1], 'int64')] + inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')] + labels = [hapi.Input([None, 1], 'int64', 'label')] train_dataset = hapi.datasets.MNIST(mode='train') @@ -441,8 +441,8 @@ class ModelCheckpoint(Callback): optim = fluid.optimizer.Adam(0.001) model.prepare(optimizer=optim, - loss_function=paddle.nn.CrossEntropyLoss(), - metrics=hapi.metrics.Accuracy()) + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy()) callback = hapi.callbacks.ModelCheckpoint(save_dir='./temp') model.fit(train_dataset, batch_size=64, callbacks=callback) diff --git a/python/paddle/incubate/hapi/datasets/__init__.py b/python/paddle/incubate/hapi/datasets/__init__.py index fc5df6401992def4bc37329794e534a832924da3..a88b0e6bbf1975d97bfeb68025b978ce877c6baf 100644 --- a/python/paddle/incubate/hapi/datasets/__init__.py +++ b/python/paddle/incubate/hapi/datasets/__init__.py @@ -15,11 +15,41 @@ from . import folder from . import mnist from . import flowers +from . import cifar +from . import voc2012 +from . import conll05 +from . import imdb +from . import imikolov +from . import movielens +from . import movie_reviews +from . import uci_housing +from . import wmt14 +from . 
import wmt16 from .folder import * from .mnist import * from .flowers import * +from .cifar import * +from .voc2012 import * +from .conll05 import * +from .imdb import * +from .imikolov import * +from .movielens import * +from .movie_reviews import * +from .uci_housing import * +from .wmt14 import * +from .wmt16 import * __all__ = folder.__all__ \ - + mnist.__all__ \ - + flowers.__all__ + + mnist.__all__ \ + + flowers.__all__ \ + + cifar.__all__ \ + + voc2012.__all__ \ + + conll05.__all__ \ + + imdb.__all__ \ + + imikolov.__all__ \ + + movielens.__all__ \ + + movie_reviews.__all__ \ + + uci_housing.__all__ \ + + wmt14.__all__ \ + + wmt16.__all__ diff --git a/python/paddle/incubate/hapi/datasets/cifar.py b/python/paddle/incubate/hapi/datasets/cifar.py new file mode 100644 index 0000000000000000000000000000000000000000..adfa786e615368ba90dab154924678de79104b55 --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/cifar.py @@ -0,0 +1,207 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import tarfile +import numpy as np +import six +from six.moves import cPickle as pickle + +from paddle.io import Dataset +from .utils import _check_exists_and_download + +__all__ = ['Cifar10', 'Cifar100'] + +URL_PREFIX = 'https://dataset.bj.bcebos.com/cifar/' +CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' +CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' +CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' +CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' + +MODE_FLAG_MAP = { + 'train10': 'data_batch', + 'test10': 'test_batch', + 'train100': 'train', + 'test100': 'test' +} + + +class Cifar10(Dataset): + """ + Implementation of `Cifar-10 `_ + dataset, which has 10 categories. + + Args: + data_file(str): path to data file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train', 'test' mode. Default 'train'. + transform(callable): transform to perform on image, None for on transform. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of cifar-10 dataset + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import Cifar10 + from paddle.incubate.hapi.vision.transforms import Normalize + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = paddle.nn.Linear(3072, 10, act='softmax') + + def forward(self, image, label): + image = paddle.reshape(image, (3, -1)) + return self.fc(image), label + + paddle.disable_static() + + normalize = Normalize(mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]) + cifar10 = Cifar10(mode='train', transform=normalize) + + for i in range(10): + image, label = cifar10[i] + image = paddle.to_tensor(image) + label = paddle.to_tensor(label) + + model = SimpleNet() + image, label = model(image, label) + print(image.numpy().shape, label.numpy().shape) + + """ + + def __init__(self, + data_file=None, + mode='train', + transform=None, + download=True): + assert mode.lower() in ['train', 'test', 'train', 'test'], \ + "mode should be 'train10', 'test10', 'train100' or 'test100', but got {}".format(mode) + self.mode = mode.lower() + + self._init_url_md5_flag() + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download( + data_file, self.data_url, self.data_md5, 'cifar', download) + + self.transform = transform + + # read dataset into memory + self._load_data() + + def _init_url_md5_flag(self): + self.data_url = CIFAR10_URL + self.data_md5 = CIFAR10_MD5 + self.flag = MODE_FLAG_MAP[self.mode + '10'] + + def _load_data(self): + self.data = [] + with tarfile.open(self.data_file, mode='r') as f: + names = (each_item.name for each_item in f + if self.flag in each_item.name) + + for name in names: + if six.PY2: + batch = pickle.load(f.extractfile(name)) + else: + batch = pickle.load(f.extractfile(name), encoding='bytes') + + data = batch[six.b('data')] + labels = batch.get( + six.b('labels'), batch.get(six.b('fine_labels'), None)) + assert labels is not None + for sample, label in six.moves.zip(data, labels): + self.data.append((sample, label)) + + def __getitem__(self, idx): + image, label = self.data[idx] + if self.transform is not None: + image = self.transform(image) + return image, label + + def __len__(self): + return len(self.data) + + +class Cifar100(Cifar10): + """ + Implementation of `Cifar-100 `_ + dataset, which has 100 categories. + + Args: + data_file(str): path to data file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train', 'test' mode. Default 'train'. + transform(callable): transform to perform on image, None for on transform. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of cifar-100 dataset + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import Cifar100 + from paddle.incubate.hapi.vision.transforms import Normalize + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = paddle.nn.Linear(3072, 100, act='softmax') + + def forward(self, image, label): + image = paddle.reshape(image, (3, -1)) + return self.fc(image), label + + paddle.disable_static() + + normalize = Normalize(mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]) + cifar100 = Cifar100(mode='train', transform=normalize) + + for i in range(10): + image, label = cifar100[i] + image = paddle.to_tensor(image) + label = paddle.to_tensor(label) + + model = SimpleNet() + image, label = model(image, label) + print(image.numpy().shape, label.numpy().shape) + + """ + + def __init__(self, + data_file=None, + mode='train', + transform=None, + download=True): + super(Cifar100, self).__init__(data_file, mode, transform, download) + + def _init_url_md5_flag(self): + self.data_url = CIFAR100_URL + self.data_md5 = CIFAR100_MD5 + self.flag = MODE_FLAG_MAP[self.mode + '100'] diff --git a/python/paddle/incubate/hapi/datasets/conll05.py b/python/paddle/incubate/hapi/datasets/conll05.py new file mode 100644 index 0000000000000000000000000000000000000000..094e3559335363524c4ae893f70294a4afaa7037 --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/conll05.py @@ -0,0 +1,297 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import gzip +import tarfile +import numpy as np +import six +from six.moves import cPickle as pickle + +from paddle.io import Dataset +import paddle.compat as cpt +from .utils import _check_exists_and_download + +__all__ = ['Conll05st'] + +DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz' +DATA_MD5 = '387719152ae52d60422c016e92a742fc' +WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' +WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' +VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt' +VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' +TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt' +TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' +EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb' +EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' + +UNK_IDX = 0 + + +class Conll05st(Dataset): + """ + Implementation of `Conll05st `_ + test dataset. + + Note: only support download test dataset automatically for that + only test dataset of Conll05st is public. + + Args: + data_file(str): path to data tar file, can be set None if + :attr:`download` is True. Default None + word_dict_file(str): path to word dictionary file, can be set None if + :attr:`download` is True. Default None + verb_dict_file(str): path to verb dictionary file, can be set None if + :attr:`download` is True. 
Default None + target_dict_file(str): path to target dictionary file, can be set None if + :attr:`download` is True. Default None + emb_file(str): path to embedding dictionary file, only used for + :code:`get_embedding` can be set None if :attr:`download` is + True. Default None + download(bool): whether to download dataset automatically if + :attr:`data_file` :attr:`word_dict_file` :attr:`verb_dict_file` + :attr:`target_dict_file` is not set. Default True + + Returns: + Dataset: instance of conll05st dataset + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import Conll05st + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, pred_idx, mark, label): + return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label) + + paddle.disable_static() + + conll05st = Conll05st() + + for i in range(10): + pred_idx, mark, label= conll05st[i][-3:] + pred_idx = paddle.to_tensor(pred_idx) + mark = paddle.to_tensor(mark) + label = paddle.to_tensor(label) + + model = SimpleNet() + pred_idx, mark, label= model(pred_idx, mark, label) + print(pred_idx.numpy(), mark.numpy(), label.numpy()) + + """ + + def __init__(self, + data_file=None, + word_dict_file=None, + verb_dict_file=None, + target_dict_file=None, + emb_file=None, + download=True): + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download( + data_file, DATA_URL, DATA_MD5, 'conll05st', download) + + self.word_dict_file = word_dict_file + if self.word_dict_file is None: + assert download, "word_dict_file is not set and downloading automatically is disabled" + self.word_dict_file = _check_exists_and_download( + word_dict_file, WORDDICT_URL, WORDDICT_MD5, 'conll05st', + download) + + self.verb_dict_file = verb_dict_file + if self.verb_dict_file is None: + assert download, "verb_dict_file is not set and downloading automatically is disabled" + self.verb_dict_file = _check_exists_and_download( + verb_dict_file, VERBDICT_URL, VERBDICT_MD5, 'conll05st', + download) + + self.target_dict_file = target_dict_file + if self.target_dict_file is None: + assert download, "target_dict_file is not set and downloading automatically is disabled" + self.target_dict_file = _check_exists_and_download( + target_dict_file, TRGDICT_URL, TRGDICT_MD5, 'conll05st', + download) + + self.word_dict = self._load_dict(self.word_dict_file) + self.predicate_dict = self._load_dict(self.verb_dict_file) + self.label_dict = self._load_label_dict(self.target_dict_file) + + # read dataset into memory + self._load_anno() + + def _load_label_dict(self, filename): + d = dict() + tag_dict = set() + with open(filename, 'r') as f: + for i, line in enumerate(f): + line = line.strip() + if line.startswith("B-"): + tag_dict.add(line[2:]) + elif line.startswith("I-"): + tag_dict.add(line[2:]) + index = 0 + for tag in tag_dict: + d["B-" + tag] = index + index += 1 + d["I-" + tag] = index + index += 1 + d["O"] = index + return d + + def _load_dict(self, filename): + d = dict() + with open(filename, 'r') as f: + for i, line in enumerate(f): + d[line.strip()] = i + return d + + def _load_anno(self): + tf = tarfile.open(self.data_file) + wf = tf.extractfile( + "conll05st-release/test.wsj/words/test.wsj.words.gz") + pf = tf.extractfile( + "conll05st-release/test.wsj/props/test.wsj.props.gz") + self.sentences = [] + self.predicates = [] + self.labels = [] + with 
gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile( + fileobj=pf) as props_file: + sentences = [] + labels = [] + one_seg = [] + for word, label in zip(words_file, props_file): + word = cpt.to_text(word.strip()) + label = cpt.to_text(label.strip().split()) + + if len(label) == 0: # end of sentence + for i in range(len(one_seg[0])): + a_kind_lable = [x[i] for x in one_seg] + labels.append(a_kind_lable) + + if len(labels) >= 1: + verb_list = [] + for x in labels[0]: + if x != '-': + verb_list.append(x) + + for i, lbl in enumerate(labels[1:]): + cur_tag = 'O' + is_in_bracket = False + lbl_seq = [] + verb_word = '' + for l in lbl: + if l == '*' and is_in_bracket == False: + lbl_seq.append('O') + elif l == '*' and is_in_bracket == True: + lbl_seq.append('I-' + cur_tag) + elif l == '*)': + lbl_seq.append('I-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') != -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') == -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = True + else: + raise RuntimeError('Unexpected label: %s' % + l) + + self.sentences.append(sentences) + self.predicates.append(verb_list[i]) + self.labels.append(lbl_seq) + + sentences = [] + labels = [] + one_seg = [] + else: + sentences.append(word) + one_seg.append(label) + + pf.close() + wf.close() + tf.close() + + def __getitem__(self, idx): + sentence = self.sentences[idx] + predicate = self.predicates[idx] + labels = self.labels[idx] + + sen_len = len(sentence) + + verb_index = labels.index('B-V') + mark = [0] * len(labels) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence[verb_index - 1] + else: + ctx_n1 = 'bos' + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence[verb_index - 2] + else: + ctx_n2 = 'bos' + + mark[verb_index] = 1 + ctx_0 = sentence[verb_index] + + if verb_index < len(labels) - 1: + mark[verb_index + 1] = 1 + ctx_p1 = sentence[verb_index + 1] + else: + ctx_p1 = 'eos' + + if verb_index < len(labels) - 2: + mark[verb_index + 2] = 1 + ctx_p2 = sentence[verb_index + 2] + else: + ctx_p2 = 'eos' + + word_idx = [self.word_dict.get(w, UNK_IDX) for w in sentence] + + ctx_n2_idx = [self.word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_idx = [self.word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_idx = [self.word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_idx = [self.word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_idx = [self.word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + pred_idx = [self.predicate_dict.get(predicate)] * sen_len + label_idx = [self.label_dict.get(w) for w in labels] + + return (np.array(word_idx), np.array(ctx_n2_idx), np.array(ctx_n1_idx), + np.array(ctx_0_idx), np.array(ctx_p1_idx), np.array(ctx_p2_idx), + np.array(pred_idx), np.array(mark), np.array(label_idx)) + + def __len__(self): + return len(self.sentences) + + def get_dict(self): + """ + Get the word, verb and label dictionary of Wikipedia corpus. 
+ """ + return self.word_dict, self.predicate_dict, self.label_dict + + def get_embedding(self): + return self.emb_file diff --git a/python/paddle/incubate/hapi/datasets/flowers.py b/python/paddle/incubate/hapi/datasets/flowers.py index 6f56cc82c1cba800002d82cc8a2bd5ddae619f9e..141d2a53b577b8c9be9ac153a36c5b2fa51ded77 100644 --- a/python/paddle/incubate/hapi/datasets/flowers.py +++ b/python/paddle/incubate/hapi/datasets/flowers.py @@ -36,12 +36,13 @@ SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' # In official 'readme', tstid is the flag of test data # and trnid is the flag of train data. But test data is more than train data. # So we exchange the train data and test data. -MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': "valid"} +MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': 'valid'} class Flowers(Dataset): """ - Implement of flowers dataset + Implementation of `Flowers `_ + dataset Args: data_file(str): path to data file, can be set None if @@ -51,9 +52,9 @@ class Flowers(Dataset): setid_file(str): path to subset index file, can be set None if :attr:`download` is True. Default None mode(str): 'train', 'valid' or 'test' mode. Default 'train'. - download(bool): whether auto download mnist dataset if - :attr:`image_path`/:attr:`label_path` unset. Default - True + transform(callable): transform to perform on image, None for on transform. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True Examples: @@ -82,19 +83,19 @@ class Flowers(Dataset): self.data_file = data_file if self.data_file is None: - assert download, "data_file not set and auto download disabled" + assert download, "data_file is not set and downloading automatically is disabled" self.data_file = _check_exists_and_download( data_file, DATA_URL, DATA_MD5, 'flowers', download) self.label_file = label_file if self.label_file is None: - assert download, "label_file not set and auto download disabled" + assert download, "label_file is not set and downloading automatically is disabled" self.label_file = _check_exists_and_download( label_file, LABEL_URL, LABEL_MD5, 'flowers', download) self.setid_file = setid_file if self.setid_file is None: - assert download, "setid_file not set and auto download disabled" + assert download, "setid_file is not set and downloading automatically is disabled" self.setid_file = _check_exists_and_download( setid_file, SETID_URL, SETID_MD5, 'flowers', download) diff --git a/python/paddle/incubate/hapi/datasets/imdb.py b/python/paddle/incubate/hapi/datasets/imdb.py new file mode 100644 index 0000000000000000000000000000000000000000..12d166bc784a382ac5ae70491d3e8061ad1d1e9f --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/imdb.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
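# A minimal standalone sketch (standard library only, toy corpus is hypothetical) of the
# vocabulary scheme used by the Imdb dataset added below: count word frequencies over the
# tokenized corpus, keep words occurring more than `cutoff` times, order them by
# (frequency desc, word asc), and reserve the last index for the unknown-word token
# (token name here is illustrative), mirroring `Imdb._build_work_dict`.
import collections

def build_word_dict(docs, cutoff=1):
    # Count word occurrences across all tokenized documents.
    word_freq = collections.defaultdict(int)
    for doc in docs:
        for word in doc:
            word_freq[word] += 1
    # Keep words seen more than `cutoff` times, most frequent (then alphabetical) first.
    kept = sorted(
        (item for item in word_freq.items() if item[1] > cutoff),
        key=lambda x: (-x[1], x[0]))
    word_idx = {word: i for i, (word, _) in enumerate(kept)}
    word_idx['<unk>'] = len(word_idx)  # last index is the unknown-word bucket
    return word_idx

docs = [['good', 'movie', 'good'], ['bad', 'movie'], ['good', 'plot']]
print(build_word_dict(docs, cutoff=1))  # {'good': 0, 'movie': 1, '<unk>': 2}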
+ +from __future__ import print_function + +import re +import six +import string +import tarfile +import numpy as np +import collections + +from paddle.io import Dataset +from .utils import _check_exists_and_download + +__all__ = ['Imdb'] + +URL = 'https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz' +MD5 = '7c2ac02c03563afcf9b574c7e56c153a' + + +class Imdb(Dataset): + """ + Implementation of `IMDB `_ dataset. + + Args: + data_file(str): path to data tar file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train' 'test' mode. Default 'train'. + cutoff(int): cutoff number for building word dictionary. Default 150. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of IMDB dataset + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import Imdb + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, doc, label): + return paddle.sum(doc), label + + paddle.disable_static() + + imdb = Imdb(mode='train') + + for i in range(10): + doc, label = imdb[i] + doc = paddle.to_tensor(doc) + label = paddle.to_tensor(label) + + model = SimpleNet() + image, label = model(doc, label) + print(doc.numpy().shape, label.numpy().shape) + + """ + + def __init__(self, data_file=None, mode='train', cutoff=150, download=True): + assert mode.lower() in ['train', 'test'], \ + "mode should be 'train', 'test', but got {}".format(mode) + self.mode = mode.lower() + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download(data_file, URL, MD5, + 'imdb', download) + + # Build a word dictionary from the corpus + self.word_idx = self._build_work_dict(cutoff) + + # read dataset into memory + self._load_anno() + + def _build_work_dict(self, cutoff): + word_freq = collections.defaultdict(int) + pattern = re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$") + for doc in self._tokenize(pattern): + for word in doc: + word_freq[word] += 1 + + # Not sure if we should prune less-frequent words here. + word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff] + + dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*dictionary)) + word_idx = dict(list(zip(words, six.moves.range(len(words))))) + word_idx[''] = len(words) + return word_idx + + def _tokenize(self, pattern): + data = [] + with tarfile.open(self.data_file) as tarf: + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. 
+ data.append( + tarf.extractfile(tf).read().rstrip(six.b("\n\r")) + .translate(None, six.b(string.punctuation)).lower( + ).split()) + tf = tarf.next() + + return data + + def _load_anno(self): + pos_pattern = re.compile("aclImdb/{}/pos/.*\.txt$".format(self.mode)) + neg_pattern = re.compile("aclImdb/{}/neg/.*\.txt$".format(self.mode)) + + UNK = self.word_idx[''] + + self.docs = [] + self.labels = [] + for doc in self._tokenize(pos_pattern): + self.docs.append([self.word_idx.get(w, UNK) for w in doc]) + self.labels.append(0) + for doc in self._tokenize(neg_pattern): + self.docs.append([self.word_idx.get(w, UNK) for w in doc]) + self.labels.append(1) + + def __getitem__(self, idx): + return (np.array(self.docs[idx]), np.array([self.labels[idx]])) + + def __len__(self): + return len(self.docs) diff --git a/python/paddle/incubate/hapi/datasets/imikolov.py b/python/paddle/incubate/hapi/datasets/imikolov.py new file mode 100644 index 0000000000000000000000000000000000000000..2e6ad43b506265ee8c9c8617a87eba5a041632bd --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/imikolov.py @@ -0,0 +1,171 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import six +import tarfile +import numpy as np +import collections + +from paddle.io import Dataset +from .utils import _check_exists_and_download + +__all__ = ['Imikolov'] + +URL = 'https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgz' +MD5 = '30177ea32e27c525793142b6bf2c8e2d' + + +class Imikolov(Dataset): + """ + Implementation of imikolov dataset. + + Args: + data_file(str): path to data tar file, can be set None if + :attr:`download` is True. Default None + data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'. + window_size(int): sliding window size for 'NGRAM' data. Default -1. + mode(str): 'train' 'test' mode. Default 'train'. + min_word_freq(int): minimal word frequence for building word dictionary. Default 50. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of imikolov dataset + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import Imikolov + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, src, trg): + return paddle.sum(src), paddle.sum(trg) + + paddle.disable_static() + + imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2) + + for i in range(10): + src, trg = imikolov[i] + src = paddle.to_tensor(src) + trg = paddle.to_tensor(trg) + + model = SimpleNet() + src, trg = model(src, trg) + print(src.numpy().shape, trg.numpy().shape) + + """ + + def __init__(self, + data_file=None, + data_type='NGRAM', + window_size=-1, + mode='train', + min_word_freq=50, + download=True): + assert data_type.upper() in ['NGRAM', 'SEQ'], \ + "data type should be 'NGRAM', 'SEQ', but got {}".format(data_type) + self.data_type = data_type.upper() + + assert mode.lower() in ['train', 'test'], \ + "mode should be 'train', 'test', but got {}".format(mode) + self.mode = mode.lower() + + self.window_size = window_size + self.min_word_freq = min_word_freq + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically disabled" + self.data_file = _check_exists_and_download(data_file, URL, MD5, + 'imikolov', download) + + # Build a word dictionary from the corpus + self.word_idx = self._build_work_dict(min_word_freq) + + # read dataset into memory + self._load_anno() + + def word_count(self, f, word_freq=None): + if word_freq is None: + word_freq = collections.defaultdict(int) + + for l in f: + for w in l.strip().split(): + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 + + return word_freq + + def _build_work_dict(self, cutoff): + train_filename = './simple-examples/data/ptb.train.txt' + test_filename = './simple-examples/data/ptb.valid.txt' + with tarfile.open(self.data_file) as tf: + trainf = tf.extractfile(train_filename) + testf = tf.extractfile(test_filename) + word_freq = self.word_count(testf, self.word_count(trainf)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] + + word_freq = [ + x for x in six.iteritems(word_freq) if x[1] > self.min_word_freq + ] + + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) + word_idx = dict(list(zip(words, six.moves.range(len(words))))) + word_idx[''] = len(words) + + return word_idx + + def _load_anno(self): + self.data = [] + with tarfile.open(self.data_file) as tf: + filename = './simple-examples/data/ptb.{}.txt'.format(self.mode) + f = tf.extractfile(filename) + + UNK = self.word_idx[''] + for l in f: + if self.data_type == 'NGRAM': + assert self.window_size > -1, 'Invalid gram length' + l = [''] + l.strip().split() + [''] + if len(l) >= self.window_size: + l = [self.word_idx.get(w, UNK) for w in l] + for i in six.moves.range(self.window_size, len(l) + 1): + self.data.append(tuple(l[i - self.window_size:i])) + elif self.data_type == 'SEQ': + l = l.strip().split() + l = [self.word_idx.get(w, UNK) for w in l] + src_seq = [self.word_idx['']] + l + trg_seq = l + [self.word_idx['']] + if self.window_size > 0 and len(src_seq) > self.window_size: + continue + self.data.append((src_seq, trg_seq)) + else: + assert False, 'Unknow data type' + + def __getitem__(self, idx): + return tuple([np.array(d) for d in self.data[idx]]) + + def __len__(self): + return len(self.data) diff --git a/python/paddle/incubate/hapi/datasets/mnist.py 
b/python/paddle/incubate/hapi/datasets/mnist.py index bd48ca1c9668b40ac0379bfeda11a5c056f9fd44..ed046e5a1d9bbcc33f3148c6ecde8a349e478cb0 100644 --- a/python/paddle/incubate/hapi/datasets/mnist.py +++ b/python/paddle/incubate/hapi/datasets/mnist.py @@ -38,7 +38,7 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' class MNIST(Dataset): """ - Implement of MNIST dataset + Implementation of `MNIST `_ dataset Args: image_path(str): path to image file, can be set None if @@ -48,9 +48,8 @@ class MNIST(Dataset): chw_format(bool): If set True, the output shape is [1, 28, 28], otherwise, output shape is [1, 784]. Default True. mode(str): 'train' or 'test' mode. Default 'train'. - download(bool): whether auto download mnist dataset if - :attr:`image_path`/:attr:`label_path` unset. Default - True + download(bool): whether to download dataset automatically if + :attr:`image_path` :attr:`label_path` is not set. Default True Returns: Dataset: MNIST Dataset. @@ -82,7 +81,7 @@ class MNIST(Dataset): self.chw_format = chw_format self.image_path = image_path if self.image_path is None: - assert download, "image_path not set and auto download disabled" + assert download, "image_path is not set and downloading automatically is disabled" image_url = TRAIN_IMAGE_URL if mode == 'train' else TEST_IMAGE_URL image_md5 = TRAIN_IMAGE_MD5 if mode == 'train' else TEST_IMAGE_MD5 self.image_path = _check_exists_and_download( @@ -90,9 +89,9 @@ class MNIST(Dataset): self.label_path = label_path if self.label_path is None: - assert download, "label_path not set and auto download disabled" - label_url = TRAIN_LABEL_URL if mode == 'train' else TEST_LABEL_URL - label_md5 = TRAIN_LABEL_MD5 if mode == 'train' else TEST_LABEL_MD5 + assert download, "label_path is not set and downloading automatically is disabled" + label_url = TRAIN_LABEL_URL if self.mode == 'train' else TEST_LABEL_URL + label_md5 = TRAIN_LABEL_MD5 if self.mode == 'train' else TEST_LABEL_MD5 self.label_path = _check_exists_and_download( label_path, label_url, label_md5, 'mnist', download) diff --git a/python/paddle/incubate/hapi/datasets/movie_reviews.py b/python/paddle/incubate/hapi/datasets/movie_reviews.py new file mode 100644 index 0000000000000000000000000000000000000000..7bf0684ebcd315807b9dc736c5481383073e5ba8 --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/movie_reviews.py @@ -0,0 +1,173 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
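# A minimal sketch of the class-interleaving step in the MovieReviews dataset added below:
# `_sort_files` pairs the negative and positive review file lists with zip() and flattens
# the pairs with itertools.chain, so consecutive samples alternate between the two classes.
# The file ids here are hypothetical placeholders.
from itertools import chain

neg_files = ['neg/cv000.txt', 'neg/cv001.txt']  # hypothetical ids
pos_files = ['pos/cv000.txt', 'pos/cv001.txt']  # hypothetical ids
interleaved = list(chain.from_iterable(zip(neg_files, pos_files)))
print(interleaved)  # ['neg/cv000.txt', 'pos/cv000.txt', 'neg/cv001.txt', 'pos/cv001.txt']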
+ +from __future__ import print_function + +import os +import six +import numpy as np +import collections +import nltk +from nltk.corpus import movie_reviews +import zipfile +from functools import cmp_to_key +from itertools import chain + +import paddle +from paddle.io import Dataset + +__all__ = ['MovieReviews'] + +URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip" +MD5 = '155de2b77c6834dd8eea7cbe88e93acb' + +NUM_TRAINING_INSTANCES = 1600 +NUM_TOTAL_INSTANCES = 2000 + + +class MovieReviews(Dataset): + """ + Implementation of `NLTK movie reviews `_ dataset. + + Args: + data_file(str): path to data tar file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train' 'test' mode. Default 'train'. + download(bool): whether auto download cifar dataset if + :attr:`data_file` unset. Default True. + + Returns: + Dataset: instance of movie reviews dataset + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import MovieReviews + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, word, category): + return paddle.sum(word), category + + paddle.disable_static() + + movie_reviews = MovieReviews(mode='train') + + for i in range(10): + word_list, category = movie_reviews[i] + word_list = paddle.to_tensor(word_list) + category = paddle.to_tensor(category) + + model = SimpleNet() + word_list, category = model(word_list, category) + print(word_list.numpy().shape, category.numpy()) + + """ + + def __init__(self, mode='train'): + assert mode.lower() in ['train', 'test'], \ + "mode should be 'train', 'test', but got {}".format(mode) + self.mode = mode.lower() + + self._download_data_if_not_yet() + + # read dataset into memory + self._load_sentiment_data() + + def _get_word_dict(self): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = list(six.iteritems(word_freq_dict)) + words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1])) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + def _sort_files(self): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list( + chain.from_iterable(list(zip(neg_file_list, pos_file_list)))) + return files_list + + def _load_sentiment_data(self): + """ + Load the data set + :return: + data_set + """ + self.data = [] + words_ids = dict(self._get_word_dict()) + for sample_file in self._sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + self.data.append((words_list, category)) + + def _download_data_if_not_yet(self): + """ + Download the data set, if the data set is not download. 
+ """ + try: + # download and extract movie_reviews.zip + paddle.dataset.common.download( + URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip') + path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora') + filename = os.path.join(path, 'movie_reviews.zip') + zip_file = zipfile.ZipFile(filename) + zip_file.extractall(path) + zip_file.close() + # make sure that nltk can find the data + if paddle.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.dataset.common.DATA_HOME) + movie_reviews.categories() + except LookupError: + print("Downloading movie_reviews data set, please wait.....") + nltk.download( + 'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) + print("Download data set success.....") + print("Path is " + nltk.data.find('corpora/movie_reviews').path) + + def __getitem__(self, idx): + if self.mode == 'test': + idx += NUM_TRAINING_INSTANCES + data = self.data[idx] + return np.array(data[0]), np.array(data[1]) + + def __len__(self): + if self.mode == 'train': + return NUM_TRAINING_INSTANCES + else: + return NUM_TOTAL_INSTANCES - NUM_TRAINING_INSTANCES diff --git a/python/paddle/incubate/hapi/datasets/movielens.py b/python/paddle/incubate/hapi/datasets/movielens.py new file mode 100644 index 0000000000000000000000000000000000000000..228e9dc6d477cf539683963dc6ddaa3c02c8fe95 --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/movielens.py @@ -0,0 +1,219 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import zipfile +import re +import random +import functools +import six + +import paddle +from paddle.io import Dataset +import paddle.compat as cpt +from .utils import _check_exists_and_download + +__all__ = ['Movielens'] + +age_table = [1, 18, 25, 35, 45, 50, 56] + +URL = 'https://dataset.bj.bcebos.com/movielens%2Fml-1m.zip' +MD5 = 'c4d9eecfca2ab87c1945afe126590906' + + +class MovieInfo(object): + """ + Movie id, title and categories information are stored in MovieInfo. + """ + + def __init__(self, index, categories, title): + self.index = int(index) + self.categories = categories + self.title = title + + def value(self, categories_dict, movie_title_dict): + """ + Get information from a movie. + """ + return [[self.index], [categories_dict[c] for c in self.categories], + [movie_title_dict[w.lower()] for w in self.title.split()]] + + def __str__(self): + return "" % ( + self.index, self.title, self.categories) + + def __repr__(self): + return self.__str__() + + +class UserInfo(object): + """ + User id, gender, age, and job information are stored in UserInfo. + """ + + def __init__(self, index, gender, age, job_id): + self.index = int(index) + self.is_male = gender == 'M' + self.age = age_table.index(int(age)) + self.job_id = int(job_id) + + def value(self): + """ + Get information from a user. 
+ """ + return [[self.index], [0 if self.is_male else 1], [self.age], + [self.job_id]] + + def __str__(self): + return "" % ( + self.index, "M" + if self.is_male else "F", age_table[self.age], self.job_id) + + def __repr__(self): + return str(self) + + +class Movielens(Dataset): + """ + Implementation of `Movielens 1-M `_ dataset. + + Args: + data_file(str): path to data tar file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train' or 'test' mode. Default 'train'. + test_ratio(float): split ratio for test sample. Default 0.1. + rand_seed(int): random seed. Default 0. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of Movielens 1-M dataset + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import Movielens + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, category, title, rating): + return paddle.sum(category), paddle.sum(title), paddle.sum(rating) + + paddle.disable_static() + + movielens = Movielens(mode='train') + + for i in range(10): + category, title, rating = movielens[i][-3:] + category = paddle.to_tensor(category) + title = paddle.to_tensor(title) + rating = paddle.to_tensor(rating) + + model = SimpleNet() + category, title, rating = model(category, title, rating) + print(category.numpy().shape, title.numpy().shape, rating.numpy().shape) + + """ + + def __init__(self, + data_file=None, + mode='train', + test_ratio=0.1, + rand_seed=0, + download=True): + assert mode.lower() in ['train', 'test'], \ + "mode should be 'train', 'test', but got {}".format(mode) + self.mode = mode.lower() + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download(data_file, URL, MD5, + 'sentiment', download) + + self.test_ratio = test_ratio + self.rand_seed = rand_seed + + np.random.seed(rand_seed) + self._load_meta_info() + self._load_data() + + def _load_meta_info(self): + pattern = re.compile(r'^(.*)\((\d+)\)$') + self.movie_info = dict() + self.movie_title_dict = dict() + self.categories_dict = dict() + self.user_info = dict() + with zipfile.ZipFile(self.data_file) as package: + for info in package.infolist(): + assert isinstance(info, zipfile.ZipInfo) + title_word_set = set() + categories_set = set() + with package.open('ml-1m/movies.dat') as movie_file: + for i, line in enumerate(movie_file): + line = cpt.to_text(line, encoding='latin') + movie_id, title, categories = line.strip().split('::') + categories = categories.split('|') + for c in categories: + categories_set.add(c) + title = pattern.match(title).group(1) + self.movie_info[int(movie_id)] = MovieInfo( + index=movie_id, categories=categories, title=title) + for w in title.split(): + title_word_set.add(w.lower()) + + for i, w in enumerate(title_word_set): + self.movie_title_dict[w] = i + + for i, c in enumerate(categories_set): + self.categories_dict[c] = i + + with package.open('ml-1m/users.dat') as user_file: + for line in user_file: + line = cpt.to_text(line, encoding='latin') + uid, gender, age, job, _ = line.strip().split("::") + self.user_info[int(uid)] = UserInfo( + index=uid, gender=gender, age=age, job_id=job) + + def _load_data(self): + self.data = [] + is_test = self.mode == 'test' + with zipfile.ZipFile(self.data_file) as package: + with package.open('ml-1m/ratings.dat') 
as rating: + for line in rating: + line = cpt.to_text(line, encoding='latin') + if (np.random.random() < self.test_ratio) == is_test: + uid, mov_id, rating, _ = line.strip().split("::") + uid = int(uid) + mov_id = int(mov_id) + rating = float(rating) * 2 - 5.0 + + mov = self.movie_info[mov_id] + usr = self.user_info[uid] + self.data.append(usr.value() + \ + mov.value(self.categories_dict, self.movie_title_dict) + \ + [[rating]]) + + def __getitem__(self, idx): + data = self.data[idx] + return tuple([np.array(d) for d in data]) + + def __len__(self): + return len(self.data) diff --git a/python/paddle/incubate/hapi/datasets/uci_housing.py b/python/paddle/incubate/hapi/datasets/uci_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..c1f2c4a5bb5d9d60ba1316e3e2a5f174df94fe99 --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/uci_housing.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import six +import numpy as np + +import paddle.dataset.common +from paddle.io import Dataset +from .utils import _check_exists_and_download + +__all__ = ["UCIHousing"] + +URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT' +] + + +class UCIHousing(Dataset): + """ + Implementation of `UCI housing `_ + dataset + + Args: + data_file(str): path to data file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train' or 'test' mode. Default 'train'. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of UCI housing dataset. + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import UCIHousing + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, feature, target): + return paddle.sum(feature), target + + paddle.disable_static() + + uci_housing = UCIHousing(mode='train') + + for i in range(10): + feature, target = uci_housing[i] + feature = paddle.to_tensor(feature) + target = paddle.to_tensor(target) + + model = SimpleNet() + feature, target = model(feature, target) + print(feature.numpy().shape, target.numpy()) + + """ + + def __init__(self, data_file=None, mode='train', download=True): + assert mode.lower() in ['train', 'test'], \ + "mode should be 'train' or 'test', but got {}".format(mode) + self.mode = mode.lower() + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download(data_file, URL, MD5, + 'uci_housing', download) + + # read dataset into memory + self._load_data() + + def _load_data(self, feature_num=14, ratio=0.8): + data = np.fromfile(self.data_file, sep=' ') + data = data.reshape(data.shape[0] // feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + for i in six.moves.range(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + if self.mode == 'train': + self.data = data[:offset] + elif self.mode == 'test': + self.data = data[offset:] + + def __getitem__(self, idx): + data = self.data[idx] + return np.array(data[:-1]), np.array(data[-1:]) + + def __len__(self): + return len(self.data) diff --git a/python/paddle/incubate/hapi/datasets/voc2012.py b/python/paddle/incubate/hapi/datasets/voc2012.py new file mode 100644 index 0000000000000000000000000000000000000000..1811c455db530710a0559c077975ab08d6a94ac3 --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/voc2012.py @@ -0,0 +1,137 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import io +import tarfile +import numpy as np +from PIL import Image + +from paddle.io import Dataset +from .utils import _check_exists_and_download + +__all__ = ["VOC2012"] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + +MODE_FLAG_MAP = {'train': 'trainval', 'test': 'train', 'valid': "val"} + + +class VOC2012(Dataset): + """ + Implementation of `VOC2012 `_ dataset + + Args: + data_file(str): path to data file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train', 'valid' or 'test' mode. 
Default 'train'. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import VOC2012 + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, image, label): + return paddle.sum(image), label + + paddle.disable_static() + + voc2012 = VOC2012(mode='train') + + for i in range(10): + image, label= voc2012[i] + image = paddle.cast(paddle.to_tensor(image), 'float32') + label = paddle.to_tensor(label) + + model = SimpleNet() + image, label= model(image, label) + print(image.numpy().shape, label.numpy().shape) + + """ + + def __init__(self, + data_file=None, + mode='train', + transform=None, + download=True): + assert mode.lower() in ['train', 'valid', 'test'], \ + "mode should be 'train', 'valid' or 'test', but got {}".format(mode) + self.flag = MODE_FLAG_MAP[mode.lower()] + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download( + data_file, VOC_URL, VOC_MD5, CACHE_DIR, download) + self.transform = transform + + # read dataset into memory + self._load_anno() + + def _load_anno(self): + self.name2mem = {} + self.data_tar = tarfile.open(self.data_file) + for ele in self.data_tar.getmembers(): + self.name2mem[ele.name] = ele + + set_file = SET_FILE.format(self.flag) + sets = self.data_tar.extractfile(self.name2mem[set_file]) + + self.data = [] + self.labels = [] + + for line in sets: + line = line.strip() + data = DATA_FILE.format(line.decode('utf-8')) + label = LABEL_FILE.format(line.decode('utf-8')) + self.data.append(data) + self.labels.append(label) + + def __getitem__(self, idx): + data_file = self.data[idx] + label_file = self.labels[idx] + + data = self.data_tar.extractfile(self.name2mem[data_file]).read() + label = self.data_tar.extractfile(self.name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + if self.transform is not None: + data = self.transform(data) + return data, label + + def __len__(self): + return len(self.data) + + def __del__(self): + if self.data_tar: + self.data_tar.close() diff --git a/python/paddle/incubate/hapi/datasets/wmt14.py b/python/paddle/incubate/hapi/datasets/wmt14.py new file mode 100644 index 0000000000000000000000000000000000000000..b495ea931a80425b8e24b81cdf8fdfd2c0920a3e --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/wmt14.py @@ -0,0 +1,179 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
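The new `wmt14.py` module below follows the same Dataset pattern as the files above. One detail worth calling out: although the docstring lists `dict_size` with a default of -1, the constructor asserts `dict_size > 0`, so callers must pass a positive vocabulary size. A minimal usage sketch (it mirrors the docstring example further down; the printed shapes are illustrative only):

.. code-block:: python

    from paddle.incubate.hapi.datasets import WMT14

    # dict_size defaults to -1 in the signature, but the constructor
    # asserts dict_size > 0, so a positive size is required in practice.
    wmt14 = WMT14(mode='train', dict_size=50)

    src_ids, trg_ids, trg_ids_next = wmt14[0]
    print(len(wmt14), src_ids.shape, trg_ids.shape)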
+ +from __future__ import print_function + +import tarfile +import numpy as np +import gzip + +from paddle.io import Dataset +import paddle.compat as cpt +from .utils import _check_exists_and_download + +__all__ = ['WMT14'] + +URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' + 'cslm_joint_paper/data/dev+test.tgz') +MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' +# this is a small set of data for test. The original data is too large and +# will be add later. +URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz') +MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' + +START = "" +END = "" +UNK = "" +UNK_IDX = 2 + + +class WMT14(Dataset): + """ + Implementation of `WMT14 `_ test dataset. + The original WMT14 dataset is too large and a small set of data for set is + provided. This module will download dataset from + http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz + + Args: + data_file(str): path to data tar file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train', 'test' or 'gen'. Default 'train' + dict_size(int): word dictionary size. Default -1. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of WMT14 dataset + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import WMT14 + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, src_ids, trg_ids, trg_ids_next): + return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next) + + paddle.disable_static() + + wmt14 = WMT14(mode='train', dict_size=50) + + for i in range(10): + src_ids, trg_ids, trg_ids_next = wmt14[i] + src_ids = paddle.to_tensor(src_ids) + trg_ids = paddle.to_tensor(trg_ids) + trg_ids_next = paddle.to_tensor(trg_ids_next) + + model = SimpleNet() + src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next) + print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy()) + + """ + + def __init__(self, + data_file=None, + mode='train', + dict_size=-1, + download=True): + assert mode.lower() in ['train', 'test', 'gen'], \ + "mode should be 'train', 'test' or 'gen', but got {}".format(mode) + self.mode = mode.lower() + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download( + data_file, URL_TRAIN, MD5_TRAIN, 'wmt14', download) + + # read dataset into memory + assert dict_size > 0, "dict_size should be set as positive number" + self.dict_size = dict_size + self._load_data() + + def _load_data(self): + def __to_dict(fd, size): + out_dict = dict() + for line_count, line in enumerate(fd): + if line_count < size: + out_dict[cpt.to_text(line.strip())] = line_count + else: + break + return out_dict + + self.src_ids = [] + self.trg_ids = [] + self.trg_ids_next = [] + with tarfile.open(self.data_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith("src.dict") + ] + assert len(names) == 1 + self.src_dict = __to_dict(f.extractfile(names[0]), self.dict_size) + names = [ + each_item.name for each_item in f + if each_item.name.endswith("trg.dict") + ] + assert len(names) == 1 + self.trg_dict = __to_dict(f.extractfile(names[0]), self.dict_size) + + file_name = "{}/{}".format(self.mode, self.mode) + names = [ + each_item.name for each_item in f + if each_item.name.endswith(file_name) + ] + 
for name in names: + for line in f.extractfile(name): + line = cpt.to_text(line) + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_words = src_seq.split() + src_ids = [ + self.src_dict.get(w, UNK_IDX) + for w in [START] + src_words + [END] + ] + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [self.trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [self.trg_dict[END]] + trg_ids = [self.trg_dict[START]] + trg_ids + + self.src_ids.append(src_ids) + self.trg_ids.append(trg_ids) + self.trg_ids_next.append(trg_ids_next) + + def __getitem__(self, idx): + return (np.array(self.src_ids[idx]), np.array(self.trg_ids[idx]), + np.array(self.trg_ids_next[idx])) + + def __len__(self): + return len(self.src_ids) + + def get_dict(self, reverse=False): + if reverse: + src_dict = {v: k for k, v in six.iteritems(src_dict)} + trg_dict = {v: k for k, v in six.iteritems(trg_dict)} + return src_dict, trg_dict diff --git a/python/paddle/incubate/hapi/datasets/wmt16.py b/python/paddle/incubate/hapi/datasets/wmt16.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3cb8bfacadd15f6c0f973a09dbf544bbc396c0 --- /dev/null +++ b/python/paddle/incubate/hapi/datasets/wmt16.py @@ -0,0 +1,247 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +""" + +from __future__ import print_function + +import os +import six +import tarfile +import numpy as np +from collections import defaultdict + +import paddle +from paddle.io import Dataset +import paddle.compat as cpt +from .utils import _check_exists_and_download + +__all__ = ['WMT16'] + +DATA_URL = ("http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz") +DATA_MD5 = "0c38be43600334966403524a40dcd81e" + +TOTAL_EN_WORDS = 11250 +TOTAL_DE_WORDS = 19220 + +START_MARK = "" +END_MARK = "" +UNK_MARK = "" + + +class WMT16(Dataset): + """ + Implementation of `WMT16 `_ test dataset. + ACL2016 Multimodal Machine Translation. Please see this website for more + details: http://www.statmt.org/wmt16/multimodal-task.html#task1 + + If you use the dataset created for your task, please cite the following paper: + Multi30K: Multilingual English-German Image Descriptions. + + .. code-block:: text + + @article{elliott-EtAl:2016:VL16, + author = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.}, + title = {Multi30K: Multilingual English-German Image Descriptions}, + booktitle = {Proceedings of the 6th Workshop on Vision and Language}, + year = {2016}, + pages = {70--74}, + year = 2016 + } + + Args: + data_file(str): path to data tar file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train', 'test' or 'val'. Default 'train' + src_dict_size(int): word dictionary size for source language word. Default -1. 
+ trg_dict_size(int): word dictionary size for target language word. Default -1. + lang(str): source language, 'en' or 'de'. Default 'en'. + download(bool): whether to download dataset automatically if + :attr:`data_file` is not set. Default True + + Returns: + Dataset: instance of WMT16 dataset + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.hapi.datasets import WMT16 + + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + + def forward(self, src_ids, trg_ids, trg_ids_next): + return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next) + + paddle.disable_static() + + wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50) + + for i in range(10): + src_ids, trg_ids, trg_ids_next = wmt16[i] + src_ids = paddle.to_tensor(src_ids) + trg_ids = paddle.to_tensor(trg_ids) + trg_ids_next = paddle.to_tensor(trg_ids_next) + + model = SimpleNet() + src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next) + print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy()) + + """ + + def __init__(self, + data_file=None, + mode='train', + src_dict_size=-1, + trg_dict_size=-1, + lang='en', + download=True): + assert mode.lower() in ['train', 'test', 'val'], \ + "mode should be 'train', 'test' or 'val', but got {}".format(mode) + self.mode = mode.lower() + + self.data_file = data_file + if self.data_file is None: + assert download, "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download( + data_file, DATA_URL, DATA_MD5, 'wmt16', download) + + self.lang = lang + assert src_dict_size > 0, "dict_size should be set as positive number" + assert trg_dict_size > 0, "dict_size should be set as positive number" + self.src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if lang == "en" + else TOTAL_DE_WORDS)) + self.trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if lang == "en" + else TOTAL_EN_WORDS)) + + # load source and target word dict + self.src_dict = self._load_dict(lang, src_dict_size) + self.trg_dict = self._load_dict("de" if lang == "en" else "en", + trg_dict_size) + + # load data + self.data = self._load_data() + + def _load_dict(self, lang, dict_size, reverse=False): + dict_path = os.path.join(paddle.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + dict_found = False + if os.path.exists(dict_path): + with open(dict_path, "rb") as d: + dict_found = len(d.readlines()) == dict_size + if not dict_found: + self._build_dict(dict_path, dict_size, lang) + + word_dict = {} + with open(dict_path, "rb") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = cpt.to_text(line.strip()) + else: + word_dict[cpt.to_text(line.strip())] = idx + return word_dict + + def _build_dict(self, dict_path, dict_size, lang): + word_dict = defaultdict(int) + with tarfile.open(self.data_file, mode="r") as f: + for line in f.extractfile("wmt16/train"): + line = cpt.to_text(line) + line_split = line.strip().split("\t") + if len(line_split) != 2: continue + sen = line_split[0] if self.lang == "en" else line_split[1] + for w in sen.split(): + word_dict[w] += 1 + + with open(dict_path, "wb") as fout: + fout.write( + cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))) + for idx, word in enumerate( + sorted( + six.iteritems(word_dict), + key=lambda x: x[1], + reverse=True)): + if idx + 3 == dict_size: break + fout.write(cpt.to_bytes(word[0])) + fout.write(cpt.to_bytes('\n')) + + def _load_data(self): + # the index for 
start mark, end mark, and unk are the same in source + # language and target language. Here uses the source language + # dictionary to determine their indices. + start_id = self.src_dict[START_MARK] + end_id = self.src_dict[END_MARK] + unk_id = self.src_dict[UNK_MARK] + + src_col = 0 if self.lang == "en" else 1 + trg_col = 1 - src_col + + self.src_ids = [] + self.trg_ids = [] + self.trg_ids_next = [] + with tarfile.open(self.data_file, mode="r") as f: + for line in f.extractfile("wmt16/{}".format(self.mode)): + line = cpt.to_text(line) + line_split = line.strip().split("\t") + if len(line_split) != 2: + continue + src_words = line_split[src_col].split() + src_ids = [start_id] + [ + self.src_dict.get(w, unk_id) for w in src_words + ] + [end_id] + + trg_words = line_split[trg_col].split() + trg_ids = [self.trg_dict.get(w, unk_id) for w in trg_words] + + trg_ids_next = trg_ids + [end_id] + trg_ids = [start_id] + trg_ids + + self.src_ids.append(src_ids) + self.trg_ids.append(trg_ids) + self.trg_ids_next.append(trg_ids_next) + + def __getitem__(self, idx): + return (np.array(self.src_ids[idx]), np.array(self.trg_ids[idx]), + np.array(self.trg_ids_next[idx])) + + def __len__(self): + return len(self.src_ids) + + def get_dict(self, lang, reverse=False): + """ + return the word dictionary for the specified language. + + Args: + lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + reverse(bool): If reverse is set to False, the returned python + dictionary will use word as key and use index as value. + If reverse is set to True, the returned python + dictionary will use index as key and word as value. + + Returns: + dict: The word dictionary for the specific language. + """ + + dict_size = self.src_dict_size if lang == self.lang else self.trg_dict_size + + dict_path = os.path.join(paddle.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + assert os.path.exists(dict_path), "Word dictionary does not exist. " + "Please invoke paddle.dataset.wmt16.train/test/validation first " + "to build the dictionary." + return _load_dict(lang, dict_size) diff --git a/python/paddle/incubate/hapi/metrics.py b/python/paddle/incubate/hapi/metrics.py deleted file mode 100644 index 9e9a2e78524022d7de8ca80a7fb8e3c478dacd36..0000000000000000000000000000000000000000 --- a/python/paddle/incubate/hapi/metrics.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
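The rest of this hunk deletes the standalone `paddle.incubate.hapi.metrics` module. As the `model.py` changes later in this diff show, `Model` now takes metrics from `paddle.metric`, and the per-batch hook is called `compute` rather than `add_metric_op`. For reference, a rough port of the removed `Accuracy` onto the new hook name, kept strictly as a sketch (the class name is invented; everything else mirrors the code removed below):

.. code-block:: python

    import paddle.fluid as fluid
    from paddle.metric import Metric

    class HapiStyleAccuracy(Metric):
        """Sketch: the removed hapi Accuracy with add_metric_op renamed to compute."""

        def __init__(self, topk=(1, ), name='acc'):
            super(HapiStyleAccuracy, self).__init__()
            self.topk = topk
            self.maxk = max(topk)
            self._name = [name] if self.maxk == 1 else \
                ['{}_top{}'.format(name, k) for k in topk]
            self.reset()

        def compute(self, pred, label, *args):
            # runs as graph ops: keep the indices of the top-k scores per sample
            pred = fluid.layers.argsort(pred, descending=True)[1][:, :self.maxk]
            correct = pred == label
            return fluid.layers.cast(correct, dtype='float32')

        def update(self, correct, *args):
            # `correct` arrives here as a numpy array of shape [N, maxk]
            accs = []
            for i, k in enumerate(self.topk):
                num_corrects = correct[:, :k].sum()
                num_samples = len(correct)
                accs.append(float(num_corrects) / num_samples)
                self.total[i] += num_corrects
                self.count[i] += num_samples
            return accs

        def reset(self):
            self.total = [0.] * len(self.topk)
            self.count = [0] * len(self.topk)

        def accumulate(self):
            return [float(t) / c for t, c in zip(self.total, self.count)]

        def name(self):
            return self._name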
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import abc -import numpy as np -import paddle.fluid as fluid - -import logging - -FORMAT = '%(asctime)s-%(levelname)s: %(message)s' -logging.basicConfig(level=logging.INFO, format=FORMAT) -logger = logging.getLogger(__name__) - -__all__ = ['Metric', 'Accuracy'] - - -@six.add_metaclass(abc.ABCMeta) -class Metric(object): - """ - Base class for metric, encapsulates metric logic and APIs - Usage: - - m = SomeMetric() - for prediction, label in ...: - m.update(prediction, label) - m.accumulate() - - Advanced usage for :code:`add_metric_op` - Metric calculation can be accelerated by calculating metric states - from model outputs and labels by Paddle OPs in :code:`add_metric_op`, - metric states will be fetch as numpy array and call :code:`update` - with states in numpy format. - Metric calculated as follows (operations in Model and Metric are - indicated with curly brackets, while data nodes not): - inputs & labels || ------------------ - | || - {model} || - | || - outputs & labels || - | || tensor data - {Metric.add_metric_op} || - | || - metric states(tensor) || - | || - {fetch as numpy} || ------------------ - | || - metric states(numpy) || numpy data - | || - {Metric.update} \/ ------------------ - Examples: - - For :code:`Accuracy` metric, which takes :code:`pred` and :code:`label` - as inputs, we can calculate the correct prediction matrix between - :code:`pred` and :code:`label` in :code:`add_metric_op`. - For examples, prediction results contains 10 classes, while :code:`pred` - shape is [N, 10], :code:`label` shape is [N, 1], N is mini-batch size, - and we only need to calculate accurary of top-1 and top-5, we could - calculated the correct prediction matrix of the top-5 scores of the - prediction of each sample like follows, while the correct prediction - matrix shape is [N, 5]. - .. code-block:: python - def add_metric_op(pred, label): - # sort prediction and slice the top-5 scores - pred = fluid.layers.argsort(pred, descending=True)[1][:, :5] - # calculate whether the predictions are correct - correct = pred == label - return fluid.layers.cast(correct, dtype='float32') - With the :code:`add_metric_op`, we split some calculations to OPs(which - may run on GPU devices, will be faster), and only fetch 1 tensor with - shape as [N, 5] instead of 2 tensors with shapes as [N, 10] and [N, 1]. - :code:`update` can be define as follows: - .. code-block:: python - def update(self, correct): - accs = [] - for i, k in enumerate(self.topk): - num_corrects = correct[:, :k].sum() - num_samples = len(correct) - accs.append(float(num_corrects) / num_samples) - self.total[i] += num_corrects - self.count[i] += num_samples - return accs - """ - - def __init__(self): - pass - - @abc.abstractmethod - def reset(self): - """ - Reset states and result - """ - raise NotImplementedError("function 'reset' not implemented in {}.". - format(self.__class__.__name__)) - - @abc.abstractmethod - def update(self, *args): - """ - Update states for metric - - Inputs of :code:`update` is the outputs of :code:`Metric.add_metric_op`, - if :code:`add_metric_op` is not defined, the inputs of :code:`update` - will be flatten arguments of **output** of mode and **label** from data: - :code:`update(output1, output2, ..., label1, label2,...)` - - see :code:`Metric.add_metric_op` - """ - raise NotImplementedError("function 'update' not implemented in {}.". 
- format(self.__class__.__name__)) - - @abc.abstractmethod - def accumulate(self): - """ - Accumulates statistics, computes and returns the metric value - """ - raise NotImplementedError( - "function 'accumulate' not implemented in {}.".format( - self.__class__.__name__)) - - @abc.abstractmethod - def name(self): - """ - Returns metric name - """ - raise NotImplementedError("function 'name' not implemented in {}.". - format(self.__class__.__name__)) - - def add_metric_op(self, *args): - """ - This API is advanced usage to accelerate metric calculating, calulations - from outputs of model to the states which should be updated by Metric can - be defined here, where Paddle OPs is also supported. Outputs of this API - will be the inputs of "Metric.update". - - If :code:`add_metric_op` is defined, it will be called with **outputs** - of model and **labels** from data as arguments, all outputs and labels - will be concatenated and flatten and each filed as a separate argument - as follows: - :code:`add_metric_op(output1, output2, ..., label1, label2,...)` - - If :code:`add_metric_op` is not defined, default behaviour is to pass - input to output, so output format will be: - :code:`return output1, output2, ..., label1, label2,...` - - see :code:`Metric.update` - """ - return args - - -class Accuracy(Metric): - """ - Encapsulates accuracy metric logic - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import paddle.incubate.hapi as hapi - - fluid.enable_dygraph() - - train_dataset = hapi.datasets.MNIST(mode='train') - - model = hapi.Model(hapi.vision.LeNet(classifier_activation=None)) - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) - model.prepare( - optim, - loss_function=paddle.nn.CrossEntropyLoss(), - metrics=hapi.metrics.Accuracy()) - - model.fit(train_dataset, batch_size=64) - - """ - - def __init__(self, topk=(1, ), name=None, *args, **kwargs): - super(Accuracy, self).__init__(*args, **kwargs) - self.topk = topk - self.maxk = max(topk) - self._init_name(name) - self.reset() - - def add_metric_op(self, pred, label, *args): - pred = fluid.layers.argsort(pred, descending=True)[1][:, :self.maxk] - correct = pred == label - return fluid.layers.cast(correct, dtype='float32') - - def update(self, correct, *args): - accs = [] - for i, k in enumerate(self.topk): - num_corrects = correct[:, :k].sum() - num_samples = len(correct) - accs.append(float(num_corrects) / num_samples) - self.total[i] += num_corrects - self.count[i] += num_samples - return accs - - def reset(self): - self.total = [0.] * len(self.topk) - self.count = [0] * len(self.topk) - - def accumulate(self): - res = [] - for t, c in zip(self.total, self.count): - res.append(float(t) / c) - return res - - def _init_name(self, name): - name = name or 'acc' - if self.maxk != 1: - self._name = ['{}_top{}'.format(name, k) for k in self.topk] - else: - self._name = [name] - - def name(self): - return self._name diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/incubate/hapi/model.py index 0b12987b10a0510e1035e2b64439de9abe3fcf31..e4a6b03f7aa5c7f537dc476d8f80162e530d5dbe 100644 --- a/python/paddle/incubate/hapi/model.py +++ b/python/paddle/incubate/hapi/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
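The hunk below re-exports `paddle.static.InputSpec` as `Input`, and a later hunk removes the old hand-rolled `Input` layer. The practical effect on call sites is an argument-order change, which is why every docstring example in this diff is updated. A small before/after sketch of the call pattern:

.. code-block:: python

    import paddle.incubate.hapi as hapi

    # before this change: Input was a Layer constructed as Input(name, shape, dtype)
    #   input = hapi.Input('x', [None, 784], 'float32')

    # after this change: Input aliases paddle.static.InputSpec(shape, dtype, name)
    input = hapi.Input([None, 784], 'float32', 'x')
    label = hapi.Input([None, 1], 'int64', 'label')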
@@ -24,19 +24,27 @@ import six import warnings from collections import Iterable +import paddle from paddle import fluid -from paddle.fluid.framework import in_dygraph_mode, Variable +from paddle.fluid import core +from paddle.fluid.framework import in_dygraph_mode, Variable, ParamBase, _current_expected_place +# Note: Use alias `Input` temporarily before releasing hapi feature. +from paddle.static import InputSpec as Input from paddle.fluid.executor import global_scope from paddle.fluid.io import is_belong_to_optimizer from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec from paddle.fluid.layers.utils import flatten from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy from paddle.fluid.incubate.fleet.base import role_maker +from paddle.fluid.executor import scope_guard, Executor from paddle.io import DataLoader, Dataset +from paddle.fluid.dygraph.layers import Layer +from paddle.metric import Metric + from .distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized -from .metrics import Metric from .callbacks import config_callbacks from .utils import to_list, to_numpy, flatten_list, restore_flatten_list, extract_args from .device import _get_device @@ -47,40 +55,6 @@ __all__ = [ ] -class Input(fluid.dygraph.Layer): - """ - Define inputs the model. - - Args: - name (str): The name/alias of the variable, see :ref:`api_guide_Name` - for more details. - shape (tuple(integers)|list[integers]): List|Tuple of integers - declaring the shape. You can set "None" or -1 at a dimension - to indicate the dimension can be of any size. For example, - it is useful to set changeable batch size as "None" or -1. - dtype (np.dtype|VarType|str, optional): The type of the data. Supported - dtype: bool, float16, float32, float64, int8, int16, int32, int64, - uint8. Default: float32. - - Examples: - .. code-block:: python - - import paddle.incubate.hapi as hapi - - input = hapi.Input('x', [None, 784], 'float32') - label = hapi.Input('label', [None, 1], 'int64') - """ - - def __init__(self, name, shape=None, dtype='float32'): - super(Input, self).__init__() - self.shape = shape - self.dtype = dtype - self.name = name - - def forward(self): - return fluid.data(self.name, shape=self.shape, dtype=self.dtype) - - class StaticGraphAdapter(object): """ Model traning/inference with a static graph. 
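The remaining `model.py` hunks rename the loss hook from `loss_function` to `loss` (stored internally as `_loss`) and call `Metric.compute` where the adapters previously called `add_metric_op`. Under those renames, a typical training setup looks like the sketch below; it restates the LeNet/MNIST example from the updated docstrings, so the only assumption is that those pieces are available in your install:

.. code-block:: python

    import paddle
    import paddle.incubate.hapi as hapi

    device = hapi.set_device('gpu')
    paddle.disable_static(device)

    input = hapi.Input([None, 1, 28, 28], 'float32', 'image')
    label = hapi.Input([None, 1], 'int64', 'label')

    model = hapi.Model(hapi.vision.LeNet(classifier_activation=None),
                       input, label)
    optim = paddle.optimizer.Adam(learning_rate=0.001,
                                  parameters=model.parameters())

    # `loss` replaces the old `loss_function` argument of prepare();
    # metrics come from paddle.metric and implement compute()/update().
    model.prepare(optim,
                  loss=paddle.nn.CrossEntropyLoss(),
                  metrics=paddle.metric.Accuracy(topk=(1, 2)))

    train_dataset = hapi.datasets.MNIST(mode='train')
    model.fit(train_dataset, epochs=2, batch_size=64, verbose=1)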
@@ -388,13 +362,13 @@ class StaticGraphAdapter(object): with fluid.program_guard(prog, self._startup_prog): inputs = self.model._inputs labels = self.model._labels if self.model._labels else [] - inputs = [k.forward() for k in to_list(inputs)] - labels = [k.forward() for k in to_list(labels)] + inputs = [k._create_feed_layer() for k in to_list(inputs)] + labels = [k._create_feed_layer() for k in to_list(labels)] self._label_vars[mode] = labels outputs = to_list(self.model.network.forward(*inputs)) - if mode != 'test' and self.model._loss_function: - losses = self.model._loss_function(*(outputs + labels)) + if mode != 'test' and self.model._loss: + losses = self.model._loss(*(outputs + labels)) if self._nranks > 1 and mode != 'train': outputs = [_all_gather(o, self._nranks) for o in outputs] @@ -403,8 +377,7 @@ class StaticGraphAdapter(object): if mode != 'test': for metric in self.model._metrics: - metrics.append( - to_list(metric.add_metric_op(*(outputs + labels)))) + metrics.append(to_list(metric.compute(*(outputs + labels)))) if mode == 'train' and self.model._optimizer: self._loss_endpoint = fluid.layers.sum(losses) @@ -509,7 +482,7 @@ class DynamicGraphAdapter(object): if self._nranks > 1: outputs = self.ddp_model.forward(* [to_variable(x) for x in inputs]) - losses = self.model._loss_function(*(to_list(outputs) + labels)) + losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) final_loss = fluid.layers.sum(losses) final_loss = self.ddp_model.scale_loss(final_loss) @@ -518,7 +491,7 @@ class DynamicGraphAdapter(object): else: outputs = self.model.network.forward( * [to_variable(x) for x in inputs]) - losses = self.model._loss_function(*(to_list(outputs) + labels)) + losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) final_loss = fluid.layers.sum(losses) final_loss.backward() @@ -527,7 +500,7 @@ class DynamicGraphAdapter(object): self.model.network.clear_gradients() metrics = [] for metric in self.model._metrics: - metric_outs = metric.add_metric_op(*(to_list(outputs) + labels)) + metric_outs = metric.compute(*(to_list(outputs) + labels)) m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) @@ -542,8 +515,8 @@ class DynamicGraphAdapter(object): labels = [to_variable(l) for l in to_list(labels)] outputs = self.model.network.forward(* [to_variable(x) for x in inputs]) - if self.model._loss_function: - losses = self.model._loss_function(*(to_list(outputs) + labels)) + if self.model._loss: + losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) if self._nranks > 1: @@ -571,13 +544,13 @@ class DynamicGraphAdapter(object): self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_batch'] = samples - metric_outs = metric.add_metric_op(*(to_list(outputs) + labels)) + metric_outs = metric.compute(*(to_list(outputs) + labels)) m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) - if self.model._loss_function and len(metrics): + if self.model._loss and len(metrics): return [to_numpy(l) for l in losses], metrics - elif self.model._loss_function: + elif self.model._loss: return [to_numpy(l) for l in losses] else: return metrics @@ -665,21 +638,21 @@ class Model(object): """ An Model object is network with training and inference features. Dynamic graph and static graph are supported at the same time, - switched by `fluid.enable_dygraph()`. The usage is as follows. + switched by `paddle.disable_static()`. The usage is as follows. 
But note, the switching between dynamic and static should be before instantiating a Model. The input description, i.e, hapi.Input, must be required for static graph. Args: - network (fluid.dygraph.Layer): The network is an instance of - fluid.dygraph.Layer. + network (paddle.nn.Layer): The network is an instance of + paddle.nn.Layer. inputs (Input|list|dict|None): `inputs`, entry points of network, could be a Input layer, or lits of Input layers, or dict (name: Input), or None. For static graph, inputs must be set. For dynamic graph, it could be None. labels (Input|list|None): `labels`, entry points of network, could be a Input layer or lits of Input layers, or None. - For static graph, if labels is required in loss_function, + For static graph, if labels is required in loss, labels must be set. Otherwise, it could be None. @@ -687,13 +660,12 @@ class Model(object): .. code-block:: python import paddle - import paddle.fluid as fluid import paddle.incubate.hapi as hapi - class MyNet(fluid.dygraph.Layer): + class MyNet(paddle.nn.Layer): def __init__(self, classifier_act=None): super(MyNet, self).__init__() - self._fc1 = fluid.dygraph.Linear(784, 200, act=classifier_act) + self._fc1 = paddle.nn.Linear(784, 200, act=classifier_act) def forward(self, x): y = self._fc1(x) @@ -701,18 +673,18 @@ class Model(object): device = hapi.set_device('gpu') # if use static graph, do not set - fluid.enable_dygraph(device) + paddle.disable_static(device) # inputs and labels are not required for dynamic graph. - input = hapi.Input('x', [None, 784], 'float32') - label = hapi.Input('label', [None, 1], 'int64') + input = hapi.Input([None, 784], 'float32', 'x') + label = hapi.Input([None, 1], 'int64', 'label') model = hapi.Model(MyNet(), input, label) - optim = fluid.optimizer.SGD(learning_rate=1e-3, + optim = paddle.optimizer.SGD(learning_rate=1e-3, parameter_list=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss(), - hapi.metrics.Accuracy()) + paddle.metric.Accuracy()) mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) model.fit(mnist_data, epochs=2, batch_size=32, verbose=1) @@ -724,7 +696,7 @@ class Model(object): self.network = network self._inputs = None self._labels = None - self._loss_function = None + self._loss = None self._loss_weights = None self._optimizer = None self._optimizer = None @@ -734,16 +706,8 @@ class Model(object): if not isinstance(inputs, (list, dict, Input)): raise TypeError( "'inputs' must be list or dict in static graph mode") - if inputs is None: - self._inputs = [Input(name=n) \ - for n in extract_args(self.network.forward) if n != 'self'] - elif isinstance(input, dict): - self._inputs = [inputs[n] \ - for n in extract_args(self.network.forward) if n != 'self'] - else: - self._inputs = to_list(inputs) - - self._labels = to_list(labels) + self._inputs = self._verify_spec(inputs, True) + self._labels = self._verify_spec(labels) # init backend if fluid.in_dygraph_mode(): @@ -772,25 +736,24 @@ class Model(object): import numpy as np import paddle - import paddle.fluid as fluid import paddle.incubate.hapi as hapi - class MyNet(fluid.dygraph.Layer): + class MyNet(paddle.nn.Layer): def __init__(self, classifier_act=None): super(MyNet, self).__init__() - self._fc = fluid.dygraph.Linear(784, 10, act=classifier_act) + self._fc = paddle.nn.Linear(784, 10, act=classifier_act) def forward(self, x): y = self._fc(x) return y device = hapi.set_device('gpu') - fluid.enable_dygraph(device) + paddle.disable_static(device) - input = hapi.Input('x', [None, 784], 
'float32') - label = hapi.Input('label', [None, 1], 'int64') + input = hapi.Input([None, 784], 'float32', 'x') + label = hapi.Input([None, 1], 'int64', 'label') model = hapi.Model(MyNet(), input, label) - optim = fluid.optimizer.SGD(learning_rate=1e-3, + optim = paddle.optimizer.SGD(learning_rate=1e-3, parameter_list=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) data = np.random.random(size=(4,784)).astype(np.float32) @@ -821,25 +784,24 @@ class Model(object): import numpy as np import paddle - import paddle.fluid as fluid import paddle.incubate.hapi as hapi - class MyNet(fluid.dygraph.Layer): + class MyNet(paddle.nn.Layer): def __init__(self, classifier_act=None): super(MyNet, self).__init__() - self._fc = fluid.dygraph.Linear(784, 10, act=classifier_act) + self._fc = paddle.nn.Linear(784, 10, act=classifier_act) def forward(self, x): y = self._fc(x) return y device = hapi.set_device('gpu') - fluid.enable_dygraph(device) + paddle.disable_static(device) - input = hapi.Input('x', [None, 784], 'float32') - label = hapi.Input('label', [None, 1], 'int64') + input = hapi.Input([None, 784], 'float32', 'x') + label = hapi.Input([None, 1], 'int64', 'label') model = hapi.Model(MyNet(), input, label) - optim = fluid.optimizer.SGD(learning_rate=1e-3, + optim = paddle.optimizer.SGD(learning_rate=1e-3, parameter_list=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) @@ -867,46 +829,54 @@ class Model(object): .. code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle import paddle.incubate.hapi as hapi - class MyNet(fluid.dygraph.Layer): + class MyNet(paddle.nn.Layer): def __init__(self): super(MyNet, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') + self._fc = paddle.nn.Linear(784, 1, act='softmax') def forward(self, x): y = self._fc(x) return y device = hapi.set_device('gpu') - fluid.enable_dygraph(device) + paddle.disable_static(device) model = hapi.Model(MyNet()) model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) - out = model.eval_batch([data]) + out = model.test_batch([data]) print(out) """ return self._adapter.test_batch(inputs) - def save(self, path): - """ - This function saves parameters, optimizer infomation to path. + def save(self, path, training=True): + """ + This function saves parameters, optimizer information or model and + paramters only for inference to path. It depends on the parameter + `training`. - The parameters contains all the trainable Variable, will save to - a file with suffix ".pdparams". + If `training` is set to True, the parameters saved contain all + the trainable Variable, will save to a file with suffix ".pdparams". The optimizer information contains all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. All the information will save to a file with suffix ".pdopt". (If the optimizer have no variable need to save (like SGD), the fill will not generated). + This function will silently overwrite existing file at the target location. - This function will silently overwrite existing file - at the target location. + If `training` is set to False, only inference model will be saved. It + should be noted that before using `save`, you should run the model, and + the shape of input you saved is as same as the input of its running. + `@paddle.jit.to_static` must be added on `forward` function of your layer + in dynamic mode now and these will be optimized later. Args: path (str): The file prefix to save model. 
The format is 'dirname/file_prefix' or 'file_prefix'. if empty str. A exception will be raised. + training (bool, optional): Whether to save for training. If not, save + for inference only. Default: True. Returns: None @@ -914,25 +884,47 @@ class Model(object): Examples: .. code-block:: python - - import paddle.fluid as fluid - import paddle.incubate.hapi as hapi - - class MyNet(fluid.dygraph.Layer): - def __init__(self): - super(MyNet, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') + import paddle + import paddle.incubate.hapi as hapi + from paddle.nn import Linear + from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset + + class Mnist(paddle.nn.Layer): + def __init__(self): + super(MyNet, self).__init__() + self._fc = Linear(784, 1, act='softmax') + + @paddle.jit.to_static # If save for inference in dygraph, need this def forward(self, x): y = self._fc(x) return y - - device = hapi.set_device('cpu') - fluid.enable_dygraph(device) - model = hapi.Model(MyNet()) - model.save('checkpoint/test') + + dynamic = True # False + device = hapi.set_device('cpu') + # if use static graph, do not set + paddle.disable_static(device) if dynamic else None + + # inputs and labels are not required for dynamic graph. + input = hapi.Input([None, 784], 'float32', 'x') + label = hapi.Input([None, 1], 'int64', 'label') + + model = hapi.Model(Mnist(), input, label) + optim = paddle.optimizer.SGD(learning_rate=1e-3, + parameter_list=model.parameters()) + model.prepare(optim, + paddle.nn.CrossEntropyLoss(), + hapi.metrics.Accuracy()) + mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) + model.fit(mnist_data, epochs=1, batch_size=32, verbose=0) + model.save('checkpoint/test') # save for training + model.save('inference_model', False) # save for inference """ + if ParallelEnv().local_rank == 0: - self._adapter.save(path) + if not training: + self._save_inference_model(path) + else: + self._adapter.save(path) def load(self, path, skip_mismatch=False, reset_optimizer=False): """ @@ -967,19 +959,19 @@ class Model(object): .. code-block:: python - import paddle.fluid as fluid + import paddle import paddle.incubate.hapi as hapi - class MyNet(fluid.dygraph.Layer): + class MyNet(paddle.nn.Layer): def __init__(self): super(MyNet, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') + self._fc = paddle.nn.Linear(784, 1, act='softmax') def forward(self, x): y = self._fc(x) return y device = hapi.set_device('cpu') - fluid.enable_dygraph(device) + paddle.disable_static(device) model = hapi.Model(MyNet()) model.load('checkpoint/test') """ @@ -1042,24 +1034,24 @@ class Model(object): .. code-block:: python - import paddle.fluid as fluid + import paddle from paddle.incubate.hapi import Model - class MyNet(fluid.dygraph.Layer): + class MyNet(paddle.nn.Layer): def __init__(self): super(MyNet, self).__init__() - self._fc = fluid.dygraph.Linear(20, 10, act='softmax') + self._fc = paddle.nn.Linear(20, 10, act='softmax') def forward(self, x): y = self._fc(x) return y - fluid.enable_dygraph() + paddle.disable_static() model = Model(MyNet()) params = model.parameters() """ return self._adapter.parameters() - def prepare(self, optimizer=None, loss_function=None, metrics=None): + def prepare(self, optimizer=None, loss=None, metrics=None): """ Configures the model before runing. @@ -1067,8 +1059,8 @@ class Model(object): optimizer (Optimizer|None): Optimizer must be set in training and should be a Optimizer instance. It can be None in eval and test mode. 
- loss_function (Loss|callable function|None): Loss function can - be a `fluid.dygraph.Layer` instance or any callable function + loss (Loss|callable function|None): Loss function can + be a `paddle.nn.Layer` instance or any callable function taken the predicted values and ground truth values as input. It can be None when there is no loss. metrics (Metric|list of Metric|None): If metrics is set, all @@ -1087,7 +1079,7 @@ class Model(object): startup_prog_seed = fluid.default_startup_program( ).random_seed fluid.disable_dygraph() - fluid.enable_dygraph(self._place) + paddle.disable_static(self._place) # enable_dygraph would create and switch to a new program, # thus also copy seed to the new program fluid.default_main_program().random_seed = main_prog_seed @@ -1099,12 +1091,11 @@ class Model(object): _parallel_context_initialized = True self._optimizer = optimizer - if loss_function: - if not isinstance(loss_function, fluid.dygraph.Layer) or \ - not callable(loss_function): - raise TypeError("'loss_function' must be sub classes of \ - `fluid.dygraph.Layer` or any callable function.") - self._loss_function = loss_function + if loss is not None: + if not isinstance(loss, paddle.nn.Layer) and not callable(loss): + raise TypeError("'loss' must be sub classes of " \ + "`paddle.nn.Layer` or any callable function.") + self._loss = loss metrics = metrics or [] for metric in to_list(metrics): @@ -1184,27 +1175,26 @@ class Model(object): .. code-block:: python import paddle - import paddle.fluid as fluid import paddle.incubate.hapi as hapi dynamic = True device = hapi.set_device('gpu') - fluid.enable_dygraph(device) if dynamic else None + paddle.disable_static(device) if dynamic else None train_dataset = hapi.datasets.MNIST(mode='train') val_dataset = hapi.datasets.MNIST(mode='test') - input = hapi.Input('image', [None, 1, 28, 28], 'float32') - label = hapi.Input('label', [None, 1], 'int64') + input = hapi.Input([None, 1, 28, 28], 'float32', 'image') + label = hapi.Input([None, 1], 'int64', 'label') model = hapi.Model(hapi.vision.LeNet(classifier_activation=None), input, label) - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) model.prepare( optim, paddle.nn.CrossEntropyLoss(), - hapi.metrics.Accuracy(topk=(1, 2))) + paddle.metric.Accuracy(topk=(1, 2))) model.fit(train_dataset, val_dataset, epochs=2, @@ -1217,31 +1207,30 @@ class Model(object): .. 
code-block:: python import paddle - import paddle.fluid as fluid import paddle.incubate.hapi as hapi dynamic = True device = hapi.set_device('gpu') - fluid.enable_dygraph(device) if dynamic else None + paddle.disable_static(device) if dynamic else None train_dataset = hapi.datasets.MNIST(mode='train') - train_loader = fluid.io.DataLoader(train_dataset, + train_loader = paddle.io.DataLoader(train_dataset, places=device, batch_size=64) val_dataset = hapi.datasets.MNIST(mode='test') - val_loader = fluid.io.DataLoader(val_dataset, + val_loader = paddle.io.DataLoader(val_dataset, places=device, batch_size=64) - input = hapi.Input('image', [None, 1, 28, 28], 'float32') - label = hapi.Input('label', [None, 1], 'int64') + input = hapi.Input([None, 1, 28, 28], 'float32', 'image') + label = hapi.Input([None, 1], 'int64', 'label') model = hapi.Model(hapi.vision.LeNet(classifier_activation=None), input, label) - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) model.prepare( optim, paddle.nn.CrossEntropyLoss(), - hapi.metrics.Accuracy(topk=(1, 2))) + paddle.metric.Accuracy(topk=(1, 2))) model.fit(train_loader, val_loader, epochs=2, @@ -1353,24 +1342,24 @@ class Model(object): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import paddle.incubate.hapi as hapi # declarative mode val_dataset = hapi.datasets.MNIST(mode='test') - input = hapi.Input('image', [-1, 1, 28, 28], 'float32') - label = hapi.Input('label', [None, 1], 'int64') + input = hapi.Input([-1, 1, 28, 28], 'float32', 'image') + label = hapi.Input([None, 1], 'int64', 'label') model = hapi.Model(hapi.vision.LeNet(), input, label) - model.prepare(metrics=hapi.metrics.Accuracy()) + model.prepare(metrics=paddle.metric.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) # imperative mode - fluid.enable_dygraph() + paddle.disable_static() model = hapi.Model(hapi.vision.LeNet()) - model.prepare(metrics=hapi.metrics.Accuracy()) + model.prepare(metrics=paddle.metric.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) @@ -1433,12 +1422,13 @@ class Model(object): num_workers (int): The number of subprocess to load data, 0 for no subprocess used and loading data in main process. When train_data and eval_data are both the instance of Dataloader, this argument will be ignored. Default: 0. - stack_output (bool): Whether stack output field like a batch, as for an output + stack_outputs (bool): Whether stack output field like a batch, as for an output filed of a sample is in shape [X, Y], test_data contains N samples, predict output field will be in shape [N, X, Y] if stack_output is True, and will be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs is False. stack_outputs as False is used for LoDTensor output situation, it is recommended set as True if outputs contains no LoDTensor. Default: False. + callbacks(Callback): A Callback instance, default None. Returns: list: output of models. @@ -1446,7 +1436,7 @@ class Model(object): .. 
code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle import paddle.incubate.hapi as hapi class MnistDataset(hapi.datasets.MNIST): @@ -1466,7 +1456,7 @@ class Model(object): test_dataset = MnistDataset(mode='test', return_label=False) # declarative mode - input = hapi.Input('image', [-1, 1, 28, 28], 'float32') + input = hapi.Input([-1, 1, 28, 28], 'float32', 'image') model = hapi.Model(hapi.vision.LeNet(), input) model.prepare() @@ -1475,7 +1465,7 @@ class Model(object): # imperative mode device = hapi.set_device('cpu') - fluid.enable_dygraph(device) + paddle.disable_static(device) model = hapi.Model(hapi.vision.LeNet()) model.prepare() result = model.predict(test_dataset, batch_size=64) @@ -1519,13 +1509,17 @@ class Model(object): cbks.on_end('test', logs) return outputs - def save_inference_model(self, - save_dir, - model_filename=None, - params_filename=None, - model_only=False): + def _save_inference_model(self, + save_dir, + model_filename=None, + params_filename=None, + model_only=False): """ - Save inference model must in static mode. + Save inference model can be in static or dynamic mode. + It should be noted that before using `save_inference_model`, you should + run the model, and the shape you saved is as same as the input of its + running. `@paddle.jit.to_static` must be added on `forward` function of + your layer in dynamic mode now and these will be optimized later. Args: save_dir (str): The directory path to save the inference model. @@ -1541,40 +1535,142 @@ class Model(object): Returns: list: The fetch variables' name list - Examples: .. code-block:: python + import numpy as np + import paddle + from paddle.static import InputSpec - import paddle.fluid as fluid import paddle.incubate.hapi as hapi - - input = hapi.Input('image', [-1, 1, 28, 28], 'float32') - model = hapi.Model(hapi.vision.LeNet(), input) - model.prepare() - + from paddle.nn import Linear + from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset + + class Mnist(Layer): + def __init__(self, classifier_act=None): + super(Mnist, self).__init__() + + self.fc = Linear(input_dim=784, output_dim=10, act="softmax") + + @paddle.jit.to_static # In static mode, you need to delete this. + def forward(self, inputs): + outputs = self.fc(inputs) + return outputs + + dynamic = True # False + device = hapi.set_device('gpu') + + # if use static graph, do not set + paddle.disable_static(device) if dynamic else None + + # inputs and labels are not required for dynamic graph. + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + + model = hapi.Model(Mnist(), input, label) + optim = paddle.optimizer.SGD(learning_rate=1e-3, + parameter_list=model.parameters()) + model.prepare(optim, + paddle.nn.CrossEntropyLoss(), + hapi.metrics.Accuracy()) + mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) + model.fit(mnist_data, epochs=1, batch_size=32, verbose=0) model.save_inference_model('inference_model') """ - assert not fluid.in_dygraph_mode( - ), 'Save inference model must in static mode!' 
- prog = self._adapter._progs.get('test', None) - assert prog, \ - "Model is not ready, please call `model.prepare()` first" + def get_inout_spec(all_vars, return_name=False): + result_list = [] + valid_vars = [var for var in all_vars if isinstance(var, Variable)] + result_list = valid_vars + if return_name: + result_list = [var.name for var in result_list] - infer_prog = prog.clone(for_test=True) + return result_list - input_names = [v.name for v in self._adapter._input_vars['test']] - endpoints = self._adapter._endpoints['test']['output'] + # TODO: + # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph. + # 2. Save correct shape of input, now the interface stores the shape that the user sent to + # the inputs of the model in running. + # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode. + if fluid.in_dygraph_mode(): + layer = self.network + fluid.disable_dygraph() + + # 1. input check + prog_translator = ProgramTranslator() + if not prog_translator.enable_declarative: + raise RuntimeError( + "save_inference_model doesn't work when setting ProgramTranslator.enable=False." + ) + if not isinstance(layer, Layer): + raise TypeError( + "The input layer should be 'Layer', but received layer type is %s." + % type(layer)) + + # 2. get program of declarative Layer.forward + concrete_program = layer.forward.concrete_program + + # NOTE: we maintain the mapping of variable name to + # structured name, the buffer variable (non-persistable) + # saved to inference program may not need by dygraph Layer, + # we only record the state_dict variable's structured name + state_names_dict = dict() + for structured_name, var in layer.state_dict().items(): + state_names_dict[var.name] = structured_name + + # 3. share parameters from Layer to scope & record var info + scope = core.Scope() + extra_var_info = dict() + for param_or_buffer in concrete_program.parameters: + # share to scope + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + src_tensor = param_or_buffer.value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict['structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, ParamBase): + extra_info_dict['trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict + + # 4. build input & output spec + input_var_names = get_inout_spec(concrete_program.inputs, True) + output_vars = get_inout_spec(concrete_program.outputs) + + # 5. 
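# The export path above records, for every parameter shared into the new scope, the
# structured state_dict key plus a little metadata so the saved inference program can
# be matched back to the dygraph Layer. A hedged sketch of that bookkeeping (helper
# names are illustrative, not part of the public API):
def build_state_name_map(layer):
    # runtime var.name -> structured state_dict name
    return {var.name: structured_name
            for structured_name, var in layer.state_dict().items()}

def collect_var_info(parameters, name_map):
    # per-parameter extra info mirroring `extra_var_info` above
    info = {}
    for p in parameters:
        entry = {'stop_gradient': p.stop_gradient}
        if p.name in name_map:
            entry['structured_name'] = name_map[p.name]
        info[p.name] = entry
    return info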
save inference model + with scope_guard(scope): + return fluid.io.save_inference_model( + dirname=save_dir, + feeded_var_names=input_var_names, + target_vars=output_vars, + executor=Executor(_current_expected_place()), + main_program=concrete_program.main_program.clone(), + model_filename=model_filename, + params_filename=params_filename, + program_only=model_only) + + else: + prog = self._adapter._progs.get('test', None) + assert prog, \ + "Model is not ready, please call `model.prepare()` first" - return fluid.io.save_inference_model( - save_dir, - input_names, - endpoints, - self._adapter._executor, - main_program=infer_prog, - model_filename=model_filename, - params_filename=params_filename, - program_only=model_only) + infer_prog = prog.clone(for_test=True) + + input_names = [v.name for v in self._adapter._input_vars['test']] + endpoints = self._adapter._endpoints['test']['output'] + + return fluid.io.save_inference_model( + save_dir, + input_names, + endpoints, + self._adapter._executor, + main_program=infer_prog, + model_filename=model_filename, + params_filename=params_filename, + program_only=model_only) def _run_one_epoch(self, data_loader, callbacks, mode, logs={}): outputs = [] @@ -1601,9 +1697,9 @@ class Model(object): if mode != 'test': outs = getattr(self, mode + '_batch')(data[:len(self._inputs)], data[len(self._inputs):]) - if self._metrics and self._loss_function: + if self._metrics and self._loss: metrics = [[l[0] for l in outs[0]]] - elif self._loss_function: + elif self._loss: metrics = [[l[0] for l in outs]] else: metrics = [] @@ -1639,12 +1735,43 @@ class Model(object): return logs, outputs return logs + def _verify_spec(self, specs, is_input=False): + out_specs = [] + + if specs is None: + # Note(Aurelius84): If not specific specs of `Input`, using argument names of `forward` function + # to generate `Input`. But how can we know the actual shape of each input tensor? + if is_input: + out_specs = [ + Input( + name=n, shape=[None]) + for n in extract_args(self.network.forward) if n != 'self' + ] + else: + out_specs = to_list(specs) + elif isinstance(specs, dict): + assert is_input == False + out_specs = [specs[n] \ + for n in extract_args(self.network.forward) if n != 'self'] + else: + out_specs = to_list(specs) + # Note: checks each element has specificed `name`. + if out_specs is not None: + for i, spec in enumerate(out_specs): + assert isinstance(spec, Input) + if spec.name is None: + raise ValueError( + "Requires Input[{}].name != None, but receive `None` with {}.". 
+ format(i, spec)) + + return out_specs + def _reset_metrics(self): for metric in self._metrics: metric.reset() def _metrics_name(self): - metrics_name = ['loss'] if self._loss_function else [] + metrics_name = ['loss'] if self._loss else [] for m in self._metrics: metrics_name.extend(to_list(m.name())) return metrics_name diff --git a/python/paddle/incubate/hapi/tests/CMakeLists.txt b/python/paddle/incubate/hapi/tests/CMakeLists.txt index 5cad495de7c88781de50de9b2bbe1a765a45582f..8ffcd67443f1c8722560da20d9cfb76b18a67351 100644 --- a/python/paddle/incubate/hapi/tests/CMakeLists.txt +++ b/python/paddle/incubate/hapi/tests/CMakeLists.txt @@ -12,6 +12,7 @@ endforeach() foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() +set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 150) function(py_dist_test TARGET_NAME) diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py index b338f3310b4c796e66d88b21f1bb8353dbf5b572..ede99a50c2fa72da3bd1999204a5fe1e5a656be2 100644 --- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py @@ -25,7 +25,7 @@ from paddle import fluid from paddle.incubate.hapi import Model, Input, set_device from paddle.nn.layer.loss import CrossEntropyLoss from paddle.incubate.hapi.vision.models import LeNet -from paddle.incubate.hapi.metrics import Accuracy +from paddle.metric import Accuracy from paddle.incubate.hapi.callbacks import ProgBarLogger from paddle.incubate.hapi.datasets import MNIST @@ -64,8 +64,8 @@ class TestDistTraning(unittest.TestCase): im_shape = (-1, 1, 28, 28) batch_size = 128 - inputs = [Input('image', im_shape, 'float32')] - labels = [Input('label', [None, 1], 'int64')] + inputs = [Input(im_shape, 'float32', 'image')] + labels = [Input([None, 1], 'int64', 'label')] model = Model(LeNet(classifier_activation=None), inputs, labels) optim = fluid.optimizer.Momentum( diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py index 1484620a4efdfff0c084153e9edb001833d744ef..28305fc6a6fd08c160f946920e85391cd444caef 100644 --- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py +++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py @@ -25,7 +25,7 @@ from paddle import fluid from paddle.incubate.hapi import Model, Input, set_device from paddle.nn.layer.loss import CrossEntropyLoss from paddle.incubate.hapi.vision.models import LeNet -from paddle.incubate.hapi.metrics import Accuracy +from paddle.metric import Accuracy from paddle.incubate.hapi.callbacks import ProgBarLogger from paddle.incubate.hapi.datasets import MNIST @@ -63,8 +63,8 @@ class TestDistTraning(unittest.TestCase): im_shape = (-1, 1, 28, 28) batch_size = 128 - inputs = [Input('image', im_shape, 'float32')] - labels = [Input('label', [None, 1], 'int64')] + inputs = [Input(im_shape, 'float32', 'image')] + labels = [Input([None, 1], 'int64', 'label')] model = Model(LeNet(classifier_activation=None), inputs, labels) optim = fluid.optimizer.Momentum( diff --git a/python/paddle/incubate/hapi/tests/test_callbacks.py b/python/paddle/incubate/hapi/tests/test_callbacks.py index 2a8a470736d921628edadb55b7e0cc956e2f37f1..e49bf215c276c8b495b0f991a5821d4c674f48d2 100644 --- a/python/paddle/incubate/hapi/tests/test_callbacks.py +++ b/python/paddle/incubate/hapi/tests/test_callbacks.py @@ -36,7 +36,7 @@ class TestCallbacks(unittest.TestCase): freq = 
2 eval_steps = 20 - inputs = [Input('image', [None, 1, 28, 28], 'float32')] + inputs = [Input([None, 1, 28, 28], 'float32', 'image')] lenet = Model(LeNet(), inputs) lenet.prepare() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_cifar.py b/python/paddle/incubate/hapi/tests/test_dataset_cifar.py new file mode 100644 index 0000000000000000000000000000000000000000..08d9f4353c0ed639f5ad907c921bf7b2c88271f5 --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_cifar.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestCifar10Train(unittest.TestCase): + def test_main(self): + cifar = Cifar10(mode='train') + self.assertTrue(len(cifar) == 50000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 50000) + data, label = cifar[idx] + self.assertTrue(len(data.shape) == 1) + self.assertTrue(data.shape[0] == 3072) + self.assertTrue(0 <= int(label) <= 9) + + +class TestCifar10Test(unittest.TestCase): + def test_main(self): + cifar = Cifar10(mode='test') + self.assertTrue(len(cifar) == 10000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 10000) + data, label = cifar[idx] + self.assertTrue(len(data.shape) == 1) + self.assertTrue(data.shape[0] == 3072) + self.assertTrue(0 <= int(label) <= 9) + + +class TestCifar100Train(unittest.TestCase): + def test_main(self): + cifar = Cifar100(mode='train') + self.assertTrue(len(cifar) == 50000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 50000) + data, label = cifar[idx] + self.assertTrue(len(data.shape) == 1) + self.assertTrue(data.shape[0] == 3072) + self.assertTrue(0 <= int(label) <= 99) + + +class TestCifar100Test(unittest.TestCase): + def test_main(self): + cifar = Cifar100(mode='test') + self.assertTrue(len(cifar) == 10000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 10000) + data, label = cifar[idx] + self.assertTrue(len(data.shape) == 1) + self.assertTrue(data.shape[0] == 3072) + self.assertTrue(0 <= int(label) <= 99) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_conll05.py b/python/paddle/incubate/hapi/tests/test_dataset_conll05.py new file mode 100644 index 0000000000000000000000000000000000000000..0ed2a4180d0cb341f5d57bdf1cb9d8ef145a44fb --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_conll05.py @@ -0,0 +1,41 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
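# Quick usage sketch for the Cifar10 dataset exercised above: each sample is a
# flattened 3072-value image plus a label in [0, 9], and it can be batched with
# paddle.io.DataLoader just like MNIST in the Model examples. Illustrative only.
import paddle
import paddle.incubate.hapi as hapi
from paddle.incubate.hapi.datasets import Cifar10

device = hapi.set_device('cpu')
paddle.disable_static(device)

cifar_train = Cifar10(mode='train')
data, label = cifar_train[0]
print(data.shape, int(label))                    # (3072,) and a class id in [0, 9]

loader = paddle.io.DataLoader(cifar_train, places=device, batch_size=64, shuffle=True)
for images, labels in loader:
    print(images.shape, labels.shape)            # one batch of images / labels
    break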
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestConll05st(unittest.TestCase): + def test_main(self): + conll05st = Conll05st() + self.assertTrue(len(conll05st) == 5267) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 5267) + sample = conll05st[idx] + self.assertTrue(len(sample) == 9) + for s in sample: + self.assertTrue(len(s.shape) == 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imdb.py b/python/paddle/incubate/hapi/tests/test_dataset_imdb.py new file mode 100644 index 0000000000000000000000000000000000000000..cef73634b6b5fb114fa88b785bb77a87fe129bd5 --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_imdb.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestImdbTrain(unittest.TestCase): + def test_main(self): + imdb = Imdb(mode='train') + self.assertTrue(len(imdb) == 25000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 25000) + data, label = imdb[idx] + self.assertTrue(len(data.shape) == 1) + self.assertTrue(label.shape[0] == 1) + self.assertTrue(int(label) in [0, 1]) + + +class TestImdbTest(unittest.TestCase): + def test_main(self): + imdb = Imdb(mode='test') + self.assertTrue(len(imdb) == 25000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 25000) + data, label = imdb[idx] + self.assertTrue(len(data.shape) == 1) + self.assertTrue(label.shape[0] == 1) + self.assertTrue(int(label) in [0, 1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py b/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py new file mode 100644 index 0000000000000000000000000000000000000000..f3d97d314acbf7f55a8482fd386581fef7f16e03 --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestImikolovTrain(unittest.TestCase): + def test_main(self): + imikolov = Imikolov(mode='train', data_type='NGRAM', window_size=2) + self.assertTrue(len(imikolov) == 929589) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 929589) + data = imikolov[idx] + self.assertTrue(len(data) == 2) + + +class TestImikolovTest(unittest.TestCase): + def test_main(self): + imikolov = Imikolov(mode='test', data_type='NGRAM', window_size=2) + self.assertTrue(len(imikolov) == 82430) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 82430) + data = imikolov[idx] + self.assertTrue(len(data) == 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py b/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py new file mode 100644 index 0000000000000000000000000000000000000000..ae8a7a3035ee0e86f8ee2fa9e8a23f6036758d2d --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
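# The Imikolov dataset tested above serves windowed language-model word ids; with
# data_type='NGRAM' and window_size=2 each sample is a pair of ids (roughly current
# word and next word -- treat that reading as an assumption). Illustrative check:
from paddle.incubate.hapi.datasets import Imikolov

ngrams = Imikolov(mode='train', data_type='NGRAM', window_size=2)
first, second = ngrams[0]
print(len(ngrams), first, second)                # dataset size and one id pair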
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestMovieReviewsTrain(unittest.TestCase): + def test_main(self): + movie_reviews = MovieReviews(mode='train') + self.assertTrue(len(movie_reviews) == 1600) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 1600) + data = movie_reviews[idx] + self.assertTrue(len(data) == 2) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(int(data[1]) in [0, 1]) + + +class TestMovieReviewsTest(unittest.TestCase): + def test_main(self): + movie_reviews = MovieReviews(mode='test') + self.assertTrue(len(movie_reviews) == 400) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 400) + data = movie_reviews[idx] + self.assertTrue(len(data) == 2) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(int(data[1]) in [0, 1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movielens.py b/python/paddle/incubate/hapi/tests/test_dataset_movielens.py new file mode 100644 index 0000000000000000000000000000000000000000..f94269f930e05e04b3bdfc4324e5ae1ea15b1fb9 --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_movielens.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestMovielensTrain(unittest.TestCase): + def test_main(self): + movielens = Movielens(mode='train') + # movielens dataset random split train/test + # not check dataset length here + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 900000) + data = movielens[idx] + self.assertTrue(len(data) == 8) + for i, d in enumerate(data): + self.assertTrue(len(d.shape) == 1) + if i not in [5, 6]: + self.assertTrue(d.shape[0] == 1) + + +class TestMovielensTest(unittest.TestCase): + def test_main(self): + movielens = Movielens(mode='test') + # movielens dataset random split train/test + # not check dataset length here + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 100000) + data = movielens[idx] + self.assertTrue(len(data) == 8) + for i, d in enumerate(data): + self.assertTrue(len(d.shape) == 1) + if i not in [5, 6]: + self.assertTrue(d.shape[0] == 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py b/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..768367bff9911a352ea6b13f279d5b71938bc85b --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py @@ -0,0 +1,104 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
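# A peek at one Movielens sample, matching the assertions above: eight 1-D fields,
# where every field except indices 5 and 6 has length 1 (ids / rating) and fields 5
# and 6 are variable-length sequences. Illustrative only.
from paddle.incubate.hapi.datasets import Movielens

ml = Movielens(mode='train')
for i, field in enumerate(ml[0]):
    print(i, field.shape)                        # length-1 arrays except indices 5 and 6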
+ +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestUCIHousingTrain(unittest.TestCase): + def test_main(self): + uci_housing = UCIHousing(mode='train') + self.assertTrue(len(uci_housing) == 404) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 404) + data = uci_housing[idx] + self.assertTrue(len(data) == 2) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(data[0].shape[0] == 13) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(data[1].shape[0] == 1) + + +class TestUCIHousingTest(unittest.TestCase): + def test_main(self): + uci_housing = UCIHousing(mode='test') + self.assertTrue(len(uci_housing) == 102) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 102) + data = uci_housing[idx] + self.assertTrue(len(data) == 2) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(data[0].shape[0] == 13) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(data[1].shape[0] == 1) + + +class TestWMT14Train(unittest.TestCase): + def test_main(self): + wmt14 = WMT14(mode='train', dict_size=50) + self.assertTrue(len(wmt14) == 191155) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 191155) + data = wmt14[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +class TestWMT14Test(unittest.TestCase): + def test_main(self): + wmt14 = WMT14(mode='test', dict_size=50) + self.assertTrue(len(wmt14) == 5957) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 5957) + data = wmt14[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +class TestWMT14Gen(unittest.TestCase): + def test_main(self): + wmt14 = WMT14(mode='gen', dict_size=50) + self.assertTrue(len(wmt14) == 3001) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 3001) + data = wmt14[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_voc.py b/python/paddle/incubate/hapi/tests/test_dataset_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..85766ab8e30a3a7abd5e2966e6353b116c03e926 --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_voc.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import voc2012, VOC2012 +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + +# VOC2012 is too large for unittest to download, stub a small dataset here +voc2012.VOC_URL = 'https://paddlemodels.bj.bcebos.com/voc2012_stub/VOCtrainval_11-May-2012.tar' +voc2012.VOC_MD5 = '34cb1fe5bdc139a5454b25b16118fff8' + + +class TestVOC2012Train(unittest.TestCase): + def test_main(self): + voc2012 = VOC2012(mode='train') + self.assertTrue(len(voc2012) == 3) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 3) + image, label = voc2012[idx] + self.assertTrue(len(image.shape) == 3) + self.assertTrue(len(label.shape) == 2) + + +class TestVOC2012Valid(unittest.TestCase): + def test_main(self): + voc2012 = VOC2012(mode='valid') + self.assertTrue(len(voc2012) == 1) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 1) + image, label = voc2012[idx] + self.assertTrue(len(image.shape) == 3) + self.assertTrue(len(label.shape) == 2) + + +class TestVOC2012Test(unittest.TestCase): + def test_main(self): + voc2012 = VOC2012(mode='test') + self.assertTrue(len(voc2012) == 2) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 1) + image, label = voc2012[idx] + self.assertTrue(len(image.shape) == 3) + self.assertTrue(len(label.shape) == 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_wmt.py b/python/paddle/incubate/hapi/tests/test_dataset_wmt.py new file mode 100644 index 0000000000000000000000000000000000000000..987e55676aadb77582c58b13e626d7258f3c75b5 --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_dataset_wmt.py @@ -0,0 +1,119 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
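# The VOC2012 test above swaps the dataset URL/MD5 for a tiny stub so CI never pulls
# the full archive; the same trick works for any of these datasets when a smoke test
# is enough. Sketch reusing the stub from the test:
from paddle.incubate.hapi.datasets import voc2012, VOC2012

voc2012.VOC_URL = 'https://paddlemodels.bj.bcebos.com/voc2012_stub/VOCtrainval_11-May-2012.tar'
voc2012.VOC_MD5 = '34cb1fe5bdc139a5454b25b16118fff8'

stub = VOC2012(mode='train')
image, label = stub[0]
print(image.shape, label.shape)                  # HxWxC image and HxW segmentation mask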
+ +import unittest +import os +import numpy as np +import tempfile +import shutil +import cv2 + +from paddle.incubate.hapi.datasets import * +from paddle.incubate.hapi.datasets.utils import _check_exists_and_download + + +class TestWMT14Train(unittest.TestCase): + def test_main(self): + wmt14 = WMT14(mode='train', dict_size=50) + self.assertTrue(len(wmt14) == 191155) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 191155) + data = wmt14[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +class TestWMT14Test(unittest.TestCase): + def test_main(self): + wmt14 = WMT14(mode='test', dict_size=50) + self.assertTrue(len(wmt14) == 5957) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 5957) + data = wmt14[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +class TestWMT14Gen(unittest.TestCase): + def test_main(self): + wmt14 = WMT14(mode='gen', dict_size=50) + self.assertTrue(len(wmt14) == 3001) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 3001) + data = wmt14[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +class TestWMT16Train(unittest.TestCase): + def test_main(self): + wmt16 = WMT16( + mode='train', src_dict_size=50, trg_dict_size=50, lang='en') + self.assertTrue(len(wmt16) == 29000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 29000) + data = wmt16[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +class TestWMT16Test(unittest.TestCase): + def test_main(self): + wmt16 = WMT16( + mode='test', src_dict_size=50, trg_dict_size=50, lang='en') + self.assertTrue(len(wmt16) == 1000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 1000) + data = wmt16[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +class TestWMT16Val(unittest.TestCase): + def test_main(self): + wmt16 = WMT16(mode='val', src_dict_size=50, trg_dict_size=50, lang='en') + self.assertTrue(len(wmt16) == 1014) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 1014) + data = wmt16[idx] + self.assertTrue(len(data) == 3) + self.assertTrue(len(data[0].shape) == 1) + self.assertTrue(len(data[1].shape) == 1) + self.assertTrue(len(data[2].shape) == 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_metrics.py b/python/paddle/incubate/hapi/tests/test_metrics.py deleted file mode 100644 index 19c94b73f61a29004ea41b573aa1dd0fb9c0c8e6..0000000000000000000000000000000000000000 --- a/python/paddle/incubate/hapi/tests/test_metrics.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
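# Usage sketch for WMT16 as configured in the tests above; each sample is three 1-D id
# sequences (source and target sides -- the exact field meaning is assumed here).
from paddle.incubate.hapi.datasets import WMT16

wmt16 = WMT16(mode='val', src_dict_size=50, trg_dict_size=50, lang='en')
sample = wmt16[0]
print(len(wmt16), [part.shape for part in sample])   # 1014 samples, three 1-D arrays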
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import os -import unittest -import numpy as np - -import paddle.fluid as fluid -from paddle.fluid.dygraph.base import to_variable - -from paddle.incubate.hapi.metrics import * -from paddle.incubate.hapi.utils import to_list - - -def accuracy(pred, label, topk=(1, )): - maxk = max(topk) - pred = np.argsort(pred)[:, ::-1][:, :maxk] - correct = (pred == np.repeat(label, maxk, 1)) - - batch_size = label.shape[0] - res = [] - for k in topk: - correct_k = correct[:, :k].sum() - res.append(correct_k / batch_size) - return res - - -def convert_to_one_hot(y, C): - oh = np.random.choice(np.arange(C), C, replace=False).astype('float32') / C - oh = np.tile(oh[np.newaxis, :], (y.shape[0], 1)) - for i in range(y.shape[0]): - oh[i, int(y[i])] = 1. - return oh - - -class TestAccuracyDynamic(unittest.TestCase): - def setUp(self): - self.topk = (1, ) - self.class_num = 5 - self.sample_num = 1000 - self.name = None - - def random_pred_label(self): - label = np.random.randint(0, self.class_num, - (self.sample_num, 1)).astype('int64') - pred = np.random.randint(0, self.class_num, - (self.sample_num, 1)).astype('int32') - pred_one_hot = convert_to_one_hot(pred, self.class_num) - pred_one_hot = pred_one_hot.astype('float32') - - return label, pred_one_hot - - def test_main(self): - with fluid.dygraph.guard(fluid.CPUPlace()): - acc = Accuracy(topk=self.topk, name=self.name) - for _ in range(10): - label, pred = self.random_pred_label() - label_var = to_variable(label) - pred_var = to_variable(pred) - state = to_list(acc.add_metric_op(pred_var, label_var)) - acc.update(* [s.numpy() for s in state]) - res_m = acc.accumulate() - res_f = accuracy(pred, label, self.topk) - assert np.all(np.isclose(np.array(res_m, dtype='float64'), np.array(res_f, dtype='float64'), rtol=1e-3)), \ - "Accuracy precision error: {} != {}".format(res_m, res_f) - acc.reset() - assert np.sum(acc.total) == 0 - assert np.sum(acc.count) == 0 - - -class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic): - def setUp(self): - self.topk = (1, 5) - self.class_num = 10 - self.sample_num = 1000 - self.name = "accuracy" - - -class TestAccuracyStatic(TestAccuracyDynamic): - def test_main(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - pred = fluid.data( - name='pred', shape=[None, self.class_num], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - acc = Accuracy(topk=self.topk, name=self.name) - state = acc.add_metric_op(pred, label) - - exe = fluid.Executor(fluid.CPUPlace()) - compiled_main_prog = fluid.CompiledProgram(main_prog) - - for _ in range(10): - label, pred = self.random_pred_label() - state_ret = exe.run(compiled_main_prog, - feed={'pred': pred, - 'label': label}, - fetch_list=[s.name for s in to_list(state)], - return_numpy=True) - acc.update(*state_ret) - res_m = acc.accumulate() - res_f = accuracy(pred, label, 
self.topk) - assert np.all(np.isclose(np.array(res_m, dtype='float64'), np.array(res_f, dtype='float64'), rtol=1e-3)), \ - "Accuracy precision error: {} != {}".format(res_m, res_f) - acc.reset() - assert np.sum(acc.total) == 0 - assert np.sum(acc.count) == 0 - - -class TestAccuracyStaticMultiTopk(TestAccuracyStatic): - def setUp(self): - self.topk = (1, 5) - self.class_num = 10 - self.sample_num = 1000 - self.name = "accuracy" - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_model.py b/python/paddle/incubate/hapi/tests/test_model.py index cf659514248ec610ee893d6715d6e1260f48abea..7fc471aa1e2eeb80ae81d4a32b09eeff74193e6f 100644 --- a/python/paddle/incubate/hapi/tests/test_model.py +++ b/python/paddle/incubate/hapi/tests/test_model.py @@ -22,17 +22,20 @@ import numpy as np import shutil import tempfile +import paddle from paddle import fluid -from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential +from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax from paddle.fluid.dygraph.base import to_variable import paddle.incubate.hapi as hapi from paddle.incubate.hapi import Model, Input from paddle.nn.layer.loss import CrossEntropyLoss -from paddle.incubate.hapi.metrics import Accuracy +from paddle.metric import Accuracy from paddle.incubate.hapi.datasets import MNIST from paddle.incubate.hapi.vision.models import LeNet from paddle.incubate.hapi.distributed import DistributedBatchSampler, prepare_distributed_context +from paddle.fluid.dygraph.jit import declarative +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator class LeNetDygraph(fluid.dygraph.Layer): @@ -40,21 +43,19 @@ class LeNetDygraph(fluid.dygraph.Layer): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = Sequential( - Conv2D( + Conv2d( 1, 6, 3, stride=1, padding=1), ReLU(), Pool2D(2, 'max', 2), - Conv2D( + Conv2d( 6, 16, 5, stride=1, padding=0), ReLU(), Pool2D(2, 'max', 2)) if num_classes > 0: self.fc = Sequential( - Linear(400, 120), - Linear(120, 84), - Linear( - 84, 10, act=classifier_activation)) + Linear(400, 120), Linear(120, 84), Linear(84, 10), + Softmax()) #Todo: accept any activation def forward(self, inputs): x = self.features(inputs) @@ -65,6 +66,35 @@ class LeNetDygraph(fluid.dygraph.Layer): return x +class LeNetDeclarative(fluid.dygraph.Layer): + def __init__(self, num_classes=10, classifier_activation=None): + super(LeNetDeclarative, self).__init__() + self.num_classes = num_classes + self.features = Sequential( + Conv2d( + 1, 6, 3, stride=1, padding=1), + ReLU(), + Pool2D(2, 'max', 2), + Conv2d( + 6, 16, 5, stride=1, padding=0), + ReLU(), + Pool2D(2, 'max', 2)) + + if num_classes > 0: + self.fc = Sequential( + Linear(400, 120), Linear(120, 84), Linear(84, 10), + Softmax()) #Todo: accept any activation + + @declarative + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -141,8 +171,8 @@ class TestModel(unittest.TestCase): cls.test_dataset, places=cls.device, batch_size=64) seed = 333 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) dy_lenet = LeNetDygraph() cls.init_param = dy_lenet.state_dict() @@ -150,8 +180,8 
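# The deleted test_metrics.py above checked hapi.metrics.Accuracy against a NumPy
# reference; after this patch the equivalent check targets paddle.metric.Accuracy (see
# the import changes below). Hedged sketch -- the compute/update/accumulate flow is
# assumed from the new metric API:
import numpy as np
import paddle
from paddle.fluid.dygraph.base import to_variable


def topk_accuracy(pred, label, topk=(1,)):
    # NumPy reference, mirroring the helper removed above
    maxk = max(topk)
    ranked = np.argsort(pred)[:, ::-1][:, :maxk]
    correct = (ranked == np.repeat(label, maxk, 1))
    return [correct[:, :k].sum() / label.shape[0] for k in topk]


paddle.disable_static()
pred = np.random.rand(8, 10).astype('float32')
label = np.random.randint(0, 10, (8, 1)).astype('int64')

acc = paddle.metric.Accuracy(topk=(1,))
correct = acc.compute(to_variable(pred), to_variable(label))
acc.update(correct.numpy())
print(acc.accumulate(), topk_accuracy(pred, label))  # the two results should agree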
@@ class TestModel(unittest.TestCase): cls.acc1 = dynamic_evaluate(dy_lenet, cls.val_loader) - cls.inputs = [Input('image', [-1, 1, 28, 28], 'float32')] - cls.labels = [Input('label', [None, 1], 'int64')] + cls.inputs = [Input([-1, 1, 28, 28], 'float32', 'image')] + cls.labels = [Input([None, 1], 'int64', 'label')] cls.save_dir = tempfile.mkdtemp() cls.weight_path = os.path.join(cls.save_dir, 'lenet') @@ -193,8 +223,8 @@ class TestModel(unittest.TestCase): def fit(self, dynamic, num_replicas=None, rank=None): fluid.enable_dygraph(self.device) if dynamic else None seed = 333 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) net = LeNet(classifier_activation=None) optim_new = fluid.optimizer.Adam( @@ -202,7 +232,7 @@ class TestModel(unittest.TestCase): model = Model(net, inputs=self.inputs, labels=self.labels) model.prepare( optim_new, - loss_function=CrossEntropyLoss(reduction="sum"), + loss=CrossEntropyLoss(reduction="sum"), metrics=Accuracy()) model.fit(self.train_dataset, batch_size=64, shuffle=False) @@ -287,17 +317,19 @@ class TestModel(unittest.TestCase): class MyModel(fluid.dygraph.Layer): def __init__(self, classifier_activation='softmax'): super(MyModel, self).__init__() - self._fc = Linear(20, 10, act=classifier_activation) + self._fc = Linear(20, 10) + self._act = Softmax() #Todo: accept any activation def forward(self, x): y = self._fc(x) + y = self._act(y) return y class TestModelFunction(unittest.TestCase): def set_seed(self, seed=1024): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) def test_train_batch(self, dynamic=True): dim = 20 @@ -330,13 +362,11 @@ class TestModelFunction(unittest.TestCase): optim2 = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) - inputs = [Input('x', [None, dim], 'float32')] - labels = [Input('label', [None, 1], 'int64')] + inputs = [Input([None, dim], 'float32', 'x')] + labels = [Input([None, 1], 'int64', 'label')] model = Model(net, inputs, labels) - model.prepare( - optim2, loss_function=CrossEntropyLoss(reduction="sum")) + model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum")) loss, = model.train_batch([data], [label]) - np.testing.assert_allclose(loss.flatten(), ref.flatten()) fluid.disable_dygraph() if dynamic else None @@ -359,7 +389,7 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(device) if dynamic else None self.set_seed() net = MyModel() - inputs = [Input('x', [None, dim], 'float32')] + inputs = [Input([None, dim], 'float32', 'x')] model = Model(net, inputs) model.prepare() out, = model.test_batch([data]) @@ -373,14 +403,13 @@ class TestModelFunction(unittest.TestCase): device = hapi.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None net = MyModel(classifier_activation=None) - inputs = [Input('x', [None, 20], 'float32')] - labels = [Input('label', [None, 1], 'int64')] + inputs = [Input([None, 20], 'float32', 'x')] + labels = [Input([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs, labels) model.prepare( - optimizer=optim, - loss_function=CrossEntropyLoss(reduction="sum")) + optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.save(path + '/test') model.load(path + '/test') shutil.rmtree(path) @@ -394,18 
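# Single-batch training/inference with the renamed `loss=` argument of Model.prepare,
# condensed from the tests above; the one-layer network is illustrative only.
import numpy as np
from paddle import fluid
from paddle.nn import Linear
from paddle.nn.layer.loss import CrossEntropyLoss
import paddle.incubate.hapi as hapi
from paddle.incubate.hapi import Model, Input

device = hapi.set_device('cpu')
fluid.enable_dygraph(device)

net = Linear(20, 10)                             # logits only; the loss applies softmax
inputs = [Input([None, 20], 'float32', 'x')]
labels = [Input([None, 1], 'int64', 'label')]
model = Model(net, inputs, labels)
optim = fluid.optimizer.SGD(learning_rate=1e-3, parameter_list=net.parameters())
model.prepare(optim, loss=CrossEntropyLoss(reduction="sum"))

x = np.random.random((4, 20)).astype('float32')
y = np.random.randint(0, 10, (4, 1)).astype('int64')
loss, = model.train_batch([x], [y])              # one optimization step
out, = model.test_batch([x])                     # forward only
print(loss, out.shape)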
+423,16 @@ class TestModelFunction(unittest.TestCase): model = Model(MyModel(classifier_activation=None)) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) - model.prepare( - optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum")) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.save(path + '/test') fluid.disable_dygraph() - inputs = [Input('x', [None, 20], 'float32')] - labels = [Input('label', [None, 1], 'int64')] + inputs = [Input([None, 20], 'float32', 'x')] + labels = [Input([None, 1], 'int64', 'label')] model = Model(MyModel(classifier_activation=None), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) - model.prepare( - optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum")) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.load(path + '/test') shutil.rmtree(path) @@ -413,26 +440,24 @@ class TestModelFunction(unittest.TestCase): path = tempfile.mkdtemp() net = MyModel(classifier_activation=None) - inputs = [Input('x', [None, 20], 'float32')] - labels = [Input('label', [None, 1], 'int64')] + inputs = [Input([None, 20], 'float32', 'x')] + labels = [Input([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs, labels) - model.prepare( - optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum")) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.save(path + '/test') device = hapi.set_device('cpu') fluid.enable_dygraph(device) #if dynamic else None net = MyModel(classifier_activation=None) - inputs = [Input('x', [None, 20], 'float32')] - labels = [Input('label', [None, 1], 'int64')] + inputs = [Input([None, 20], 'float32', 'x')] + labels = [Input([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs, labels) - model.prepare( - optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum")) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.load(path + '/test') shutil.rmtree(path) fluid.disable_dygraph() @@ -442,7 +467,7 @@ class TestModelFunction(unittest.TestCase): device = hapi.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None net = MyModel() - inputs = [Input('x', [None, 20], 'float32')] + inputs = [Input([None, 20], 'float32', 'x')] model = Model(net, inputs) model.prepare() params = model.parameters() @@ -451,33 +476,48 @@ class TestModelFunction(unittest.TestCase): fluid.disable_dygraph() if dynamic else None def test_export_deploy_model(self): - net = LeNet() - inputs = [Input('image', [-1, 1, 28, 28], 'float32')] - model = Model(net, inputs) - model.prepare() - save_dir = tempfile.mkdtemp() - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - tensor_img = np.array( - np.random.random((1, 1, 28, 28)), dtype=np.float32) - ori_results = model.test_batch(tensor_img) - - model.save_inference_model(save_dir) - - place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=save_dir, executor=exe)) + for dynamic in [True, False]: + fluid.enable_dygraph() if dynamic else None + # paddle.disable_static() if dynamic else None + prog_translator = ProgramTranslator() + prog_translator.enable(False) if not dynamic else 
None + net = LeNetDeclarative() + inputs = [Input([None, 1, 28, 28], 'float32', 'x')] + model = Model(net, inputs) + model.prepare() + save_dir = tempfile.mkdtemp() + if not os.path.exists(save_dir): + os.makedirs(save_dir) + tensor_img = np.array( + np.random.random((1, 1, 28, 28)), dtype=np.float32) + ori_results = model.test_batch(tensor_img) + model.save(save_dir, training=False) + fluid.disable_dygraph() if dynamic else None - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) + place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + new_scope = fluid.Scope() + with fluid.scope_guard(new_scope): + exe = fluid.Executor(place) + [inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=save_dir, executor=exe)) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + np.testing.assert_allclose( + results, ori_results, rtol=1e-5, atol=1e-7) + shutil.rmtree(save_dir) + + +class TestRaiseError(unittest.TestCase): + def test_input_without_name(self): + net = MyModel(classifier_activation=None) - np.testing.assert_allclose(results, ori_results, rtol=1e-6) - shutil.rmtree(save_dir) + inputs = [Input([None, 10], 'float32')] + labels = [Input([None, 1], 'int64', 'label')] + with self.assertRaises(ValueError): + model = Model(net, inputs, labels) if __name__ == '__main__': diff --git a/python/paddle/incubate/hapi/tests/test_pretrained_model.py b/python/paddle/incubate/hapi/tests/test_pretrained_model.py index 588797322f4ab8e9eef9cc184cc6d82635de7d01..334ebff449d4f34c9a5a9b56ee7998b4dbc5abf0 100644 --- a/python/paddle/incubate/hapi/tests/test_pretrained_model.py +++ b/python/paddle/incubate/hapi/tests/test_pretrained_model.py @@ -28,7 +28,7 @@ class TestPretrainedModel(unittest.TestCase): fluid.enable_dygraph() net = models.__dict__[arch](pretrained=True, classifier_activation=None) - inputs = [Input('image', [None, 3, 224, 224], 'float32')] + inputs = [Input([None, 3, 224, 224], 'float32', 'image')] model = Model(network=net, inputs=inputs) model.prepare() res = model.test_batch(x) diff --git a/python/paddle/incubate/hapi/tests/test_text.py b/python/paddle/incubate/hapi/tests/test_text.py index 78f089b06a38dec4eb189a9744e503f517f220db..c4fef0d749ce788e50d8cffdf9b7041e33d078af 100644 --- a/python/paddle/incubate/hapi/tests/test_text.py +++ b/python/paddle/incubate/hapi/tests/test_text.py @@ -20,6 +20,7 @@ import random import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear, Layer from paddle.fluid.layers import BeamSearchDecoder @@ -87,15 +88,18 @@ class ModuleApiTest(unittest.TestCase): fluid.enable_dygraph(place) else: fluid.disable_dygraph() - fluid.default_main_program().random_seed = self._random_seed - fluid.default_startup_program().random_seed = self._random_seed - layer = self.model_cls(**self.attrs) if isinstance( - self.attrs, dict) else self.model_cls(*self.attrs) - model = Model(layer, inputs=self.make_inputs()) - model.prepare() - if self.param_states: - model.load(self.param_states, optim_state=None) - return model.test_batch(self.inputs) + gen = paddle.manual_seed(self._random_seed) + gen._is_init_py = False + paddle.framework.random._manual_program_seed(self._random_seed) + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + layer = self.model_cls(**self.attrs) if isinstance( + self.attrs, dict) else 
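# Condensed version of the export round-trip exercised above: a dygraph Layer whose
# forward is decorated for program translation is run once, exported with
# Model.save(..., training=False), and reloaded through load_inference_model. Paths,
# shapes and the tiny net are illustrative.
import tempfile
import numpy as np
import paddle
from paddle import fluid
from paddle.nn import Linear
from paddle.incubate.hapi import Model, Input


class TinyNet(fluid.dygraph.Layer):
    def __init__(self):
        super(TinyNet, self).__init__()
        self.fc = Linear(784, 10)

    @paddle.jit.to_static                        # required for export from dygraph
    def forward(self, x):
        return self.fc(x)


fluid.enable_dygraph(fluid.CPUPlace())
model = Model(TinyNet(), [Input([None, 784], 'float32', 'x')])
model.prepare()

x = np.random.random((1, 784)).astype('float32')
ori = model.test_batch(x)                        # run once so input shapes are recorded
save_dir = tempfile.mkdtemp()
model.save(save_dir, training=False)             # inference-format save
fluid.disable_dygraph()

exe = fluid.Executor(fluid.CPUPlace())
program, feed_names, fetch_targets = fluid.io.load_inference_model(
    dirname=save_dir, executor=exe)
out = exe.run(program, feed={feed_names[0]: x}, fetch_list=fetch_targets)
np.testing.assert_allclose(out, ori, rtol=1e-5, atol=1e-7)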
self.model_cls(*self.attrs) + model = Model(layer, inputs=self.make_inputs()) + model.prepare() + if self.param_states: + model.load(self.param_states, optim_state=None) + return model.test_batch(self.inputs) def check_output_with_place(self, place, mode="test"): dygraph_output = self._calc_output(place, mode, dygraph=True) @@ -129,12 +133,9 @@ class TestBasicLSTM(ModuleApiTest): @staticmethod def model_init(model, input_size, hidden_size): - model.lstm = RNN( - BasicLSTMCell( - input_size, - hidden_size, - param_attr=fluid.ParamAttr(name="lstm_weight"), - bias_attr=fluid.ParamAttr(name="lstm_bias"))) + model.lstm = RNN(BasicLSTMCell( + input_size, + hidden_size, )) @staticmethod def model_forward(model, inputs): @@ -142,7 +143,7 @@ class TestBasicLSTM(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), + Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), ] return inputs @@ -168,7 +169,7 @@ class TestBasicGRU(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), + Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), ] return inputs @@ -219,8 +220,8 @@ class TestBeamSearch(ModuleApiTest): def make_inputs(self): inputs = [ - Input("init_hidden", [None, self.inputs[0].shape[-1]], "float32"), - Input("init_cell", [None, self.inputs[1].shape[-1]], "float32"), + Input([None, self.inputs[0].shape[-1]], "float32", "init_hidden"), + Input([None, self.inputs[1].shape[-1]], "float32", "init_cell"), ] return inputs @@ -272,10 +273,10 @@ class TestTransformerEncoder(ModuleApiTest): def make_inputs(self): inputs = [ - Input("enc_input", [None, None, self.inputs[0].shape[-1]], - "float32"), - Input("attn_bias", [None, self.inputs[1].shape[1], None, None], - "float32"), + Input([None, None, self.inputs[0].shape[-1]], "float32", + "enc_input"), + Input([None, self.inputs[1].shape[1], None, None], "float32", + "attn_bias"), ] return inputs @@ -336,14 +337,14 @@ class TestTransformerDecoder(TestTransformerEncoder): def make_inputs(self): inputs = [ - Input("dec_input", [None, None, self.inputs[0].shape[-1]], - "float32"), - Input("enc_output", [None, None, self.inputs[0].shape[-1]], - "float32"), - Input("self_attn_bias", - [None, self.inputs[-1].shape[1], None, None], "float32"), - Input("cross_attn_bias", - [None, self.inputs[-1].shape[1], None, None], "float32"), + Input([None, None, self.inputs[0].shape[-1]], "float32", + "dec_input"), + Input([None, None, self.inputs[0].shape[-1]], "float32", + "enc_output"), + Input([None, self.inputs[-1].shape[1], None, None], "float32", + "self_attn_bias"), + Input([None, self.inputs[-1].shape[1], None, None], "float32", + "cross_attn_bias"), ] return inputs @@ -431,10 +432,10 @@ class TestTransformerBeamSearchDecoder(ModuleApiTest): def make_inputs(self): inputs = [ - Input("enc_output", [None, None, self.inputs[0].shape[-1]], - "float32"), - Input("trg_src_attn_bias", - [None, self.inputs[1].shape[1], None, None], "float32"), + Input([None, None, self.inputs[0].shape[-1]], "float32", + "enc_output"), + Input([None, self.inputs[1].shape[1], None, None], "float32", + "trg_src_attn_bias"), ] return inputs @@ -473,9 +474,9 @@ class TestSequenceTagging(ModuleApiTest): def make_inputs(self): inputs = [ - Input("word", [None, None], "int64"), - Input("lengths", [None], "int64"), - Input("target", [None, None], "int64"), + Input([None, None], "int64", "word"), + Input([None], "int64", "lengths"), + 
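# The ModuleApiTest changes above seed both the dygraph generator and static programs,
# then build and run each candidate model inside a fresh Scope so repeated builds do
# not share parameter state. Hedged sketch of that pattern:
import paddle
import paddle.fluid as fluid

seed = 123
paddle.manual_seed(seed)                             # dygraph generator
paddle.framework.random._manual_program_seed(seed)   # static programs (internal helper)

scope = fluid.core.Scope()
with fluid.scope_guard(scope):
    # build the layer / Model and call model.test_batch(...) here; everything created
    # lives in `scope` and is discarded with it
    pass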
Input([None, None], "int64", "target"), ] return inputs @@ -517,7 +518,7 @@ class TestStackedRNN(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), + Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), ] return inputs @@ -543,7 +544,7 @@ class TestLSTM(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), + Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), ] return inputs @@ -579,7 +580,7 @@ class TestBiLSTM(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), + Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), ] return inputs @@ -609,7 +610,7 @@ class TestGRU(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), + Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), ] return inputs @@ -645,7 +646,7 @@ class TestBiGRU(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), + Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), ] return inputs @@ -680,7 +681,7 @@ class TestCNNEncoder(ModuleApiTest): def make_inputs(self): inputs = [ - Input("input", [None, self.inputs[-1].shape[1], None], "float32"), + Input([None, self.inputs[-1].shape[1], None], "float32", "input"), ] return inputs diff --git a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py index c2035a8b5c5958d54c79d6ee0ff6df654bb35d51..6df9b31217aae78c43de8d29956a8b2def99055b 100644 --- a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py +++ b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py @@ -22,7 +22,7 @@ import shutil import tempfile from paddle import fluid -from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential +from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict @@ -32,21 +32,19 @@ class LeNetDygraph(fluid.dygraph.Layer): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = Sequential( - Conv2D( + Conv2d( 1, 6, 3, stride=1, padding=1), ReLU(), Pool2D(2, 'max', 2), - Conv2D( + Conv2d( 6, 16, 5, stride=1, padding=0), ReLU(), Pool2D(2, 'max', 2)) if num_classes > 0: self.fc = Sequential( - Linear(400, 120), - Linear(120, 84), - Linear( - 84, 10, act=classifier_activation)) + Linear(400, 120), Linear(120, 84), Linear(84, 10), + Softmax()) #Todo: accept any activation def forward(self, inputs): x = self.features(inputs) diff --git a/python/paddle/incubate/hapi/tests/test_vision_models.py b/python/paddle/incubate/hapi/tests/test_vision_models.py index 16dbe431be801c9cd7ce48c4cd1444b7e0e558a4..2dc9355bcc3005d48b7046123b024fa2a91594c3 100644 --- a/python/paddle/incubate/hapi/tests/test_vision_models.py +++ b/python/paddle/incubate/hapi/tests/test_vision_models.py @@ -28,7 +28,7 @@ class TestVisonModels(unittest.TestCase): else: net = models.__dict__[arch](pretrained=pretrained) - input = hapi.Input('image', [None, 3, 224, 224], 'float32') + input = hapi.Input([None, 3, 224, 224], 'float32', 'image') model = hapi.Model(net, input) model.prepare() @@ -71,7 +71,7 @@ class TestVisonModels(unittest.TestCase): self.models_infer('resnet152') def test_lenet(self): - input = 
hapi.Input('x', [None, 1, 28, 28], 'float32') + input = hapi.Input([None, 1, 28, 28], 'float32', 'x') lenet = hapi.Model(models.__dict__['LeNet'](), input) lenet.prepare() diff --git a/python/paddle/incubate/hapi/vision/models/lenet.py b/python/paddle/incubate/hapi/vision/models/lenet.py index db1d894b4aa5f2535795c6350faad6ee3aee1164..169f70562f6edfe1773a1c8d75004c25831cedcb 100644 --- a/python/paddle/incubate/hapi/vision/models/lenet.py +++ b/python/paddle/incubate/hapi/vision/models/lenet.py @@ -13,7 +13,7 @@ #limitations under the License. import paddle.fluid as fluid -from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential +from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax __all__ = ['LeNet'] @@ -39,21 +39,19 @@ class LeNet(fluid.dygraph.Layer): super(LeNet, self).__init__() self.num_classes = num_classes self.features = Sequential( - Conv2D( + Conv2d( 1, 6, 3, stride=1, padding=1), ReLU(), Pool2D(2, 'max', 2), - Conv2D( + Conv2d( 6, 16, 5, stride=1, padding=0), ReLU(), Pool2D(2, 'max', 2)) if num_classes > 0: self.fc = Sequential( - Linear(400, 120), - Linear(120, 84), - Linear( - 84, 10, act=classifier_activation)) + Linear(400, 120), Linear(120, 84), Linear(84, 10), + Softmax()) #Todo: accept any activation def forward(self, inputs): x = self.features(inputs) diff --git a/python/paddle/incubate/hapi/vision/models/vgg.py b/python/paddle/incubate/hapi/vision/models/vgg.py index 74e7228e5249fe990d037c9f12e75b6d4839c591..4352a768eb7206ca30acead580a64a7d04b7701b 100644 --- a/python/paddle/incubate/hapi/vision/models/vgg.py +++ b/python/paddle/incubate/hapi/vision/models/vgg.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle.fluid as fluid -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax from paddle.fluid.dygraph.container import Sequential from ...download import get_weights_path_from_url @@ -37,7 +37,8 @@ class Classifier(fluid.dygraph.Layer): super(Classifier, self).__init__() self.linear1 = Linear(512 * 7 * 7, 4096) self.linear2 = Linear(4096, 4096) - self.linear3 = Linear(4096, num_classes, act=classifier_activation) + self.linear3 = Linear(4096, num_classes) + self.act = Softmax() #Todo: accept any activation def forward(self, x): x = self.linear1(x) @@ -46,7 +47,8 @@ class Classifier(fluid.dygraph.Layer): x = self.linear2(x) x = fluid.layers.relu(x) x = fluid.layers.dropout(x, 0.5) - out = self.linear3(x) + x = self.linear3(x) + out = self.act(x) return out @@ -105,12 +107,11 @@ def make_layers(cfg, batch_norm=False): layers += [Pool2D(pool_size=2, pool_stride=2)] else: if batch_norm: - conv2d = Conv2D(in_channels, v, filter_size=3, padding=1) - layers += [conv2d, BatchNorm(v, act='relu')] + conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) + layers += [conv2d, BatchNorm(v), ReLU()] else: - conv2d = Conv2D( - in_channels, v, filter_size=3, padding=1, act='relu') - layers += [conv2d] + conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) + layers += [conv2d, ReLU()] in_channels = v return Sequential(*layers) diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 89bbd5916578b6e3169452d85e581c438f2bbb47..78f792d6a5a6698034912297f5d5a23db0b35201 100644 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -16,6 +16,7 @@ __all__ = [ 'Dataset', 'IterableDataset', + 'TensorDataset', 'BatchSampler', # 'Transform', 'DataLoader', @@ -42,7 +43,7 @@ __all__ = [ from ..fluid.io import DataLoader from 
..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \ - Sampler, SequenceSampler, RandomSampler + TensorDataset, Sampler, SequenceSampler, RandomSampler from ..fluid.io import load, save, load_program_state, set_program_state, \ load_inference_model, save_inference_model, batch from ..reader import shuffle, buffered, cache, chain, firstn, compose, map_readers, xmap_readers diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index 47369e3ff9cd87539f9e96708ff981dc67d06420..03299a3bb9823d31c40ae4faab601ed89570c71e 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -16,11 +16,13 @@ from ..fluid.dygraph.jit import save #DEFINE_ALIAS from ..fluid.dygraph.jit import load #DEFINE_ALIAS from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.jit import TracedLayer #DEFINE_ALIAS +from ..fluid.dygraph.jit import set_code_level #DEFINE_ALIAS +from ..fluid.dygraph.jit import set_verbosity #DEFINE_ALIAS from ..fluid.dygraph.jit import declarative as to_static #DEFINE_ALIAS from ..fluid.dygraph import ProgramTranslator #DEFINE_ALIAS from ..fluid.dygraph.io import TranslatedLayer #DEFINE_ALIAS __all__ = [ 'save', 'load', 'SaveLoadConfig', 'TracedLayer', 'to_static', - 'ProgramTranslator', 'TranslatedLayer' + 'ProgramTranslator', 'TranslatedLayer', 'set_code_level', 'set_verbosity' ] diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py index e03336f6dbab7b375701e1e694aee0bbbfa4b1cd..6e197881fc0bcbc32f9d9d738237082138f9410b 100644 --- a/python/paddle/metric/__init__.py +++ b/python/paddle/metric/__init__.py @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define the functions to calculate metric in this directory -__all__ = [ - 'Accuracy', 'Auc', 'ChunkEvaluator', 'CompositeMetric', 'DetectionMAP', - 'EditDistance', 'Precision', 'Recall', 'accuracy', 'auc', 'chunk_eval', - 'cos_sim', 'mean_iou' -] - - - -from ..fluid.metrics import Accuracy, Auc, ChunkEvaluator, CompositeMetric, DetectionMAP, EditDistance, \ - Precision, Recall +from .metrics import * +from . import metrics from ..fluid.layers.metric_op import accuracy, auc from ..fluid.layers.nn import chunk_eval, cos_sim, mean_iou + +__all__ = metrics.__all__ + [ + 'accuracy', + 'auc', + 'chunk_eval', + 'cos_sim', + 'mean_iou', +] diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..110a62c300559b9037cd2ca735aebd1946ba0ce9 --- /dev/null +++ b/python/paddle/metric/metrics.py @@ -0,0 +1,738 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
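The hapi test updates above all make the same mechanical change: `Input` now takes the shape and dtype first and the variable name last. A minimal sketch of the new calling convention (hedged: the `models` import path and re-export of `LeNet` are assumed from the tests above, not spelled out in this hunk):

.. code-block:: python

    import paddle.incubate.hapi as hapi
    from paddle.incubate.hapi.vision import models

    # old order: hapi.Input('x', [None, 1, 28, 28], 'float32')
    # new order: shape, dtype, then name
    image = hapi.Input([None, 1, 28, 28], 'float32', 'x')
    model = hapi.Model(models.LeNet(), image)
    model.prepare()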
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +import abc +import numpy as np + +import paddle + +__all__ = ['Metric', 'Accuracy', 'Precision', 'Recall', 'Auc'] + + +def _is_numpy_(var): + return isinstance(var, (np.ndarray, np.generic)) + + +@six.add_metaclass(abc.ABCMeta) +class Metric(object): + """ + Base class for metric, encapsulates metric logic and APIs + Usage: + + m = SomeMetric() + for prediction, label in ...: + m.update(prediction, label) + m.accumulate() + + Advanced usage for :code:`compute`: + + Metric calculation can be accelerated by calculating metric states + from model outputs and labels by build-in operators not by Python/NumPy + in :code:`compute`, metric states will be fetched as NumPy array and + call :code:`update` with states in NumPy format. + Metric calculated as follows (operations in Model and Metric are + indicated with curly brackets, while data nodes not): + inputs & labels || ------------------ + | || + {model} || + | || + outputs & labels || + | || tensor data + {Metric.compute} || + | || + metric states(tensor) || + | || + {fetch as numpy} || ------------------ + | || + metric states(numpy) || numpy data + | || + {Metric.update} \/ ------------------ + Examples: + + For :code:`Accuracy` metric, which takes :code:`pred` and :code:`label` + as inputs, we can calculate the correct prediction matrix between + :code:`pred` and :code:`label` in :code:`compute`. + For examples, prediction results contains 10 classes, while :code:`pred` + shape is [N, 10], :code:`label` shape is [N, 1], N is mini-batch size, + and we only need to calculate accurary of top-1 and top-5, we could + calculate the correct prediction matrix of the top-5 scores of the + prediction of each sample like follows, while the correct prediction + matrix shape is [N, 5]. + + .. code-block:: python + def compute(pred, label): + # sort prediction and slice the top-5 scores + pred = paddle.argsort(pred, descending=True)[:, :5] + # calculate whether the predictions are correct + correct = pred == label + return paddle.cast(correct, dtype='float32') + + With the :code:`compute`, we split some calculations to OPs (which + may run on GPU devices, will be faster), and only fetch 1 tensor with + shape as [N, 5] instead of 2 tensors with shapes as [N, 10] and [N, 1]. + :code:`update` can be define as follows: + + .. code-block:: python + def update(self, correct): + accs = [] + for i, k in enumerate(self.topk): + num_corrects = correct[:, :k].sum() + num_samples = len(correct) + accs.append(float(num_corrects) / num_samples) + self.total[i] += num_corrects + self.count[i] += num_samples + return accs + """ + + def __init__(self): + pass + + @abc.abstractmethod + def reset(self): + """ + Reset states and result + """ + raise NotImplementedError("function 'reset' not implemented in {}.". + format(self.__class__.__name__)) + + @abc.abstractmethod + def update(self, *args): + """ + Update states for metric + + Inputs of :code:`update` is the outputs of :code:`Metric.compute`, + if :code:`compute` is not defined, the inputs of :code:`update` + will be flatten arguments of **output** of mode and **label** from data: + :code:`update(output1, output2, ..., label1, label2,...)` + + see :code:`Metric.compute` + """ + raise NotImplementedError("function 'update' not implemented in {}.". 
+ format(self.__class__.__name__)) + + @abc.abstractmethod + def accumulate(self): + """ + Accumulates statistics, computes and returns the metric value + """ + raise NotImplementedError( + "function 'accumulate' not implemented in {}.".format( + self.__class__.__name__)) + + @abc.abstractmethod + def name(self): + """ + Returns metric name + """ + raise NotImplementedError("function 'name' not implemented in {}.". + format(self.__class__.__name__)) + + def compute(self, *args): + """ + This API is advanced usage to accelerate metric calculating, calulations + from outputs of model to the states which should be updated by Metric can + be defined here, where Paddle OPs is also supported. Outputs of this API + will be the inputs of "Metric.update". + + If :code:`compute` is defined, it will be called with **outputs** + of model and **labels** from data as arguments, all outputs and labels + will be concatenated and flatten and each filed as a separate argument + as follows: + :code:`compute(output1, output2, ..., label1, label2,...)` + + If :code:`compute` is not defined, default behaviour is to pass + input to output, so output format will be: + :code:`return output1, output2, ..., label1, label2,...` + + see :code:`Metric.update` + """ + return args + + +class Accuracy(Metric): + """ + Encapsulates accuracy metric logic. + + Args: + topk (int|tuple(int)): Number of top elements to look at + for computing accuracy. Default is (1,). + name (str, optional): String name of the metric instance. Default + is `acc`. + + Example by standalone: + + .. code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + x = paddle.to_tensor(np.array([ + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.4, 0.3, 0.2], + [0.1, 0.2, 0.4, 0.3], + [0.1, 0.2, 0.3, 0.4]])) + y = paddle.to_tensor(np.array([[0], [1], [2], [3]])) + + m = paddle.metric.Accuracy() + correct = m.compute(x, y) + m.update(correct) + res = m.accumulate() + print(res) # 0.75 + + + Example with Model API: + + .. code-block:: python + + import paddle + import paddle.incubate.hapi as hapi + + paddle.disable_static() + train_dataset = hapi.datasets.MNIST(mode='train') + + model = hapi.Model(hapi.vision.LeNet(classifier_activation=None)) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy()) + + model.fit(train_dataset, batch_size=64) + + """ + + def __init__(self, topk=(1, ), name=None, *args, **kwargs): + super(Accuracy, self).__init__(*args, **kwargs) + self.topk = topk + self.maxk = max(topk) + self._init_name(name) + self.reset() + + def compute(self, pred, label, *args): + """ + Compute the top-k (maxinum value in `topk`) indices. + + Args: + pred (Tensor): The predicted value is a Tensor wit type + float32 or float64. + label (Tensor): The ground truth value is a 2D Tensor, its + shape is [batch_size, 1] and type is int64. + + Return: + Tensor: Correct mask, a tensor with shape [batch_size, topk]. + """ + pred = paddle.argsort(pred, descending=True)[:, :self.maxk] + correct = pred == label + return paddle.cast(correct, dtype='float32') + + def update(self, correct, *args): + """ + Update the metrics states (correct count and total count), in order to + calculate cumulative accuracy of all instances. This function also + returns the accuracy of current step. + + Args: + correct: Correct mask, a tensor with shape [batch_size, topk]. + + Return: + Tensor: the accuracy of current step. 
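The abstract interface documented above is the whole contract a user-defined metric has to satisfy: `reset`, `update`, `accumulate` and `name`, plus an optional `compute` for the tensor-side part of the calculation. A minimal sketch of a custom metric written against that contract (the class and its internals are illustrative only, not part of this patch):

.. code-block:: python

    import numpy as np
    import paddle

    class MAE(paddle.metric.Metric):
        """Accumulates the mean absolute error over all seen mini-batches."""

        def __init__(self, name='mae'):
            super(MAE, self).__init__()
            self._name = name
            self.reset()

        def update(self, preds, labels):
            # accept either paddle Tensors or numpy arrays, like the built-in metrics
            if isinstance(preds, paddle.Tensor):
                preds = preds.numpy()
            if isinstance(labels, paddle.Tensor):
                labels = labels.numpy()
            self.total += np.abs(preds - labels).sum()
            self.count += labels.shape[0]

        def reset(self):
            self.total = 0.
            self.count = 0

        def accumulate(self):
            return self.total / self.count if self.count > 0 else 0.

        def name(self):
            return self._name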
+ """ + if isinstance(correct, paddle.Tensor): + correct = correct.numpy() + accs = [] + for i, k in enumerate(self.topk): + num_corrects = correct[:, :k].sum() + num_samples = len(correct) + accs.append(float(num_corrects) / num_samples) + self.total[i] += num_corrects + self.count[i] += num_samples + accs = accs[0] if len(self.topk) == 1 else accs + return accs + + def reset(self): + """ + Resets all of the metric state. + """ + self.total = [0.] * len(self.topk) + self.count = [0] * len(self.topk) + + def accumulate(self): + """ + Computes and returns the accumulated metric. + """ + res = [] + for t, c in zip(self.total, self.count): + r = float(t) / c if c > 0 else 0. + res.append(r) + res = res[0] if len(self.topk) == 1 else res + return res + + def _init_name(self, name): + name = name or 'acc' + if self.maxk != 1: + self._name = ['{}_top{}'.format(name, k) for k in self.topk] + else: + self._name = [name] + + def name(self): + """ + Return name of metric instance. + """ + return self._name + + +class Precision(Metric): + """ + Precision (also called positive predictive value) is the fraction of + relevant instances among the retrieved instances. Refer to + https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers + + Noted that this class manages the precision score only for binary + classification task. + + Args: + name (str, optional): String name of the metric instance. + Default is `precision`. + + Example by standalone: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([0.1, 0.5, 0.6, 0.7]) + y = np.array([0, 1, 1, 1]) + + m = paddle.metric.Precision() + m.update(x, y) + res = m.accumulate() + print(res) # 1.0 + + + Example with Model API: + + .. code-block:: python + + import numpy as np + + import paddle + import paddle.nn as nn + import paddle.incubate.hapi as hapi + + class Data(paddle.io.Dataset): + def __init__(self): + super(Data, self).__init__() + self.n = 1024 + self.x = np.random.randn(self.n, 10).astype('float32') + self.y = np.random.randint(2, size=(self.n, 1)).astype('float32') + + def __getitem__(self, idx): + return self.x[idx], self.y[idx] + + def __len__(self): + return self.n + + paddle.disable_static() + model = hapi.Model(nn.Sequential( + nn.Linear(10, 1), + nn.Sigmoid() + )) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + loss=nn.BCELoss(), + metrics=paddle.metric.Precision()) + + data = Data() + model.fit(data, batch_size=16) + """ + + def __init__(self, name='precision', *args, **kwargs): + super(Precision, self).__init__(*args, **kwargs) + self.tp = 0 # true positive + self.fp = 0 # false positive + self._name = name + + def update(self, preds, labels): + """ + Update the states based on the current mini-batch prediction results. + + Args: + preds (numpy.ndarray): The prediction result, usually the output + of two-class sigmoid function. It should be a vector (column + vector or row vector) with data type: 'float64' or 'float32'. + labels (numpy.ndarray): The ground truth (labels), + the shape should keep the same as preds. + The data type is 'int32' or 'int64'. 
+ """ + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + elif not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray or Tensor.") + + if isinstance(labels, paddle.Tensor): + labels = labels.numpy() + elif not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray or Tensor.") + + sample_num = labels.shape[0] + preds = np.floor(preds + 0.5).astype("int32") + + for i in range(sample_num): + pred = preds[i] + label = labels[i] + if pred == 1: + if pred == label: + self.tp += 1 + else: + self.fp += 1 + + def reset(self): + """ + Resets all of the metric state. + """ + self.tp = 0 + self.fp = 0 + + def accumulate(self): + """ + Calculate the final precision. + + Returns: + A scaler float: results of the calculated precision. + """ + ap = self.tp + self.fp + return float(self.tp) / ap if ap != 0 else .0 + + def name(self): + """ + Returns metric name + """ + return self._name + + +class Recall(Metric): + """ + Recall (also known as sensitivity) is the fraction of + relevant instances that have been retrieved over the + total amount of relevant instances + + Refer to: + https://en.wikipedia.org/wiki/Precision_and_recall + + Noted that this class manages the recall score only for + binary classification task. + + Args: + name (str, optional): String name of the metric instance. + Default is `recall`. + + Example by standalone: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([0.1, 0.5, 0.6, 0.7]) + y = np.array([1, 0, 1, 1]) + + m = paddle.metric.Recall() + m.update(x, y) + res = m.accumulate() + print(res) # 2.0 / 3.0 + + + Example with Model API: + + .. code-block:: python + + import numpy as np + + import paddle + import paddle.nn as nn + import paddle.incubate.hapi as hapi + + class Data(paddle.io.Dataset): + def __init__(self): + super(Data, self).__init__() + self.n = 1024 + self.x = np.random.randn(self.n, 10).astype('float32') + self.y = np.random.randint(2, size=(self.n, 1)).astype('float32') + + def __getitem__(self, idx): + return self.x[idx], self.y[idx] + + def __len__(self): + return self.n + + paddle.disable_static() + model = hapi.Model(nn.Sequential( + nn.Linear(10, 1), + nn.Sigmoid() + )) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + loss=nn.BCELoss(), + metrics=[paddle.metric.Precision(), paddle.metric.Recall()]) + + data = Data() + model.fit(data, batch_size=16) + """ + + def __init__(self, name='recall', *args, **kwargs): + super(Recall, self).__init__(*args, **kwargs) + self.tp = 0 # true positive + self.fn = 0 # false negative + self._name = name + + def update(self, preds, labels): + """ + Update the states based on the current mini-batch prediction results. + + Args: + preds(numpy.array): prediction results of current mini-batch, + the output of two-class sigmoid function. + Shape: [batch_size, 1]. Dtype: 'float64' or 'float32'. + labels(numpy.array): ground truth (labels) of current mini-batch, + the shape should keep the same as preds. + Shape: [batch_size, 1], Dtype: 'int32' or 'int64'. 
+ """ + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + elif not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray or Tensor.") + + if isinstance(labels, paddle.Tensor): + labels = labels.numpy() + elif not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray or Tensor.") + + sample_num = labels.shape[0] + preds = np.rint(preds).astype("int32") + + for i in range(sample_num): + pred = preds[i] + label = labels[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + self.fn += 1 + + def accumulate(self): + """ + Calculate the final recall. + + Returns: + A scaler float: results of the calculated Recall. + """ + recall = self.tp + self.fn + return float(self.tp) / recall if recall != 0 else .0 + + def reset(self): + """ + Resets all of the metric state. + """ + self.tp = 0 + self.fn = 0 + + def name(self): + """ + Returns metric name + """ + return self._name + + +class Auc(Metric): + """ + The auc metric is for binary classification. + Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve. + Please notice that the auc metric is implemented with python, which may be a little bit slow. + + The `auc` function creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the AUC. To discretize the AUC curve, a linearly spaced set of + thresholds is used to compute pairs of recall and precision values. The area + under the ROC-curve is therefore computed using the height of the recall + values by the false positive rate, while the area under the PR-curve is the + computed using the height of the precision values by the recall. + + Args: + curve (str): Specifies the mode of the curve to be computed, + 'ROC' or 'PR' for the Precision-Recall-curve. Default is 'ROC'. + num_thresholds (int): The number of thresholds to use when + discretizing the roc curve. Default is 4095. + 'ROC' or 'PR' for the Precision-Recall-curve. Default is 'ROC'. + name (str, optional): String name of the metric instance. Default + is `auc`. + + "NOTE: only implement the ROC curve type via Python now." + + Example by standalone: + .. code-block:: python + + import numpy as np + import paddle + + m = paddle.metric.Auc() + + n = 8 + class0_preds = np.random.random(size = (n, 1)) + class1_preds = 1 - class0_preds + + preds = np.concatenate((class0_preds, class1_preds), axis=1) + labels = np.random.randint(2, size = (n, 1)) + + m.update(preds=preds, labels=labels) + res = m.accumulate() + + + Example with Model API: + + .. 
code-block:: python + + import numpy as np + import paddle + import paddle.nn as nn + import paddle.incubate.hapi as hapi + + class Data(paddle.io.Dataset): + def __init__(self): + super(Data, self).__init__() + self.n = 1024 + self.x = np.random.randn(self.n, 10).astype('float32') + self.y = np.random.randint(2, size=(self.n, 1)).astype('int64') + + def __getitem__(self, idx): + return self.x[idx], self.y[idx] + + def __len__(self): + return self.n + + paddle.disable_static() + model = hapi.Model(nn.Sequential( + nn.Linear(10, 2, act='softmax'), + )) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + + def loss(x, y): + return nn.functional.nll_loss(paddle.log(x), y) + + model.prepare( + optim, + loss=loss, + metrics=paddle.metric.Auc()) + data = Data() + model.fit(data, batch_size=16) + """ + + def __init__(self, + curve='ROC', + num_thresholds=4095, + name='auc', + *args, + **kwargs): + super(Auc, self).__init__(*args, **kwargs) + self._curve = curve + self._num_thresholds = num_thresholds + + _num_pred_buckets = num_thresholds + 1 + self._stat_pos = np.zeros(_num_pred_buckets) + self._stat_neg = np.zeros(_num_pred_buckets) + self._name = name + + def update(self, preds, labels): + """ + Update the auc curve with the given predictions and labels. + + Args: + preds (numpy.array): An numpy array in the shape of + (batch_size, 2), preds[i][j] denotes the probability of + classifying the instance i into the class j. + labels (numpy.array): an numpy array in the shape of + (batch_size, 1), labels[i] is either o or 1, + representing the label of the instance i. + """ + if isinstance(labels, paddle.Tensor): + labels = labels.numpy() + elif not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray or Tensor.") + + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + elif not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray or Tensor.") + + for i, lbl in enumerate(labels): + value = preds[i, 1] + bin_idx = int(value * self._num_thresholds) + assert bin_idx <= self._num_thresholds + if lbl: + self._stat_pos[bin_idx] += 1.0 + else: + self._stat_neg[bin_idx] += 1.0 + + @staticmethod + def trapezoid_area(x1, x2, y1, y2): + return abs(x1 - x2) * (y1 + y2) / 2.0 + + def accumulate(self): + """ + Return the area (a float score) under auc curve + + Return: + float: the area under auc curve + """ + tot_pos = 0.0 + tot_neg = 0.0 + auc = 0.0 + + idx = self._num_thresholds + while idx >= 0: + tot_pos_prev = tot_pos + tot_neg_prev = tot_neg + tot_pos += self._stat_pos[idx] + tot_neg += self._stat_neg[idx] + auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos, + tot_pos_prev) + idx -= 1 + + return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 + + def reset(self): + """ + Reset states and result + """ + _num_pred_buckets = self._num_thresholds + 1 + self._stat_pos = np.zeros(_num_pred_buckets) + self._stat_neg = np.zeros(_num_pred_buckets) + + def name(self): + """ + Returns metric name + """ + return self._name diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 7b6dcdf7f67dece7dd4e8ebafdc9a1c679dd18ba..76063458d44de3000ad7c1af08376c07e0209c27 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -18,6 +18,8 @@ from .layer import norm from .functional import extension from .layer import common +from .layer import rnn +from .utils import weight_norm_hook from . 
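The `Auc` metric above is a histogram approximation: `update` drops every sample into one of `num_thresholds + 1` score buckets, and `accumulate` sweeps the buckets from the highest score downwards, summing ROC trapezoids and normalising by `tot_pos * tot_neg`. A small sketch showing that the result matches the usual pairwise-ranking reading of AUC (toy values chosen for illustration):

.. code-block:: python

    import numpy as np
    import paddle

    m = paddle.metric.Auc()

    # probability of the positive class for four samples
    class1 = np.array([[0.9], [0.35], [0.4], [0.3]])
    preds = np.concatenate([1.0 - class1, class1], axis=1)  # shape [4, 2]
    labels = np.array([[1], [1], [0], [0]])

    m.update(preds=preds, labels=labels)
    # three of the four (positive, negative) pairs are ranked correctly
    print(m.accumulate())  # 0.75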
import initializer @@ -25,6 +27,8 @@ __all__ = [] __all__ += norm.__all__ __all__ += extension.__all__ __all__ += common.__all__ +__all__ += rnn.__all__ +__all__ += weight_norm_hook.__all__ # TODO: define alias in nn directory # from .clip import ErrorClipByValue #DEFINE_ALIAS @@ -49,30 +53,58 @@ from .decode import beam_search_decode #DEFINE_ALIAS # from .decode import ctc_greedy_decoder #DEFINE_ALIAS # from .decode import dynamic_decode #DEFINE_ALIAS from .decode import gather_tree #DEFINE_ALIAS -from .input import data #DEFINE_ALIAS # from .input import Input #DEFINE_ALIAS from .layer.activation import ELU from .layer.activation import GELU +from .layer.activation import Tanh from .layer.activation import Hardshrink -# from .layer.activation import PReLU #DEFINE_ALIAS +from .layer.activation import Hardtanh +from .layer.activation import PReLU from .layer.activation import ReLU +from .layer.activation import ReLU6 #DEFINE_ALIAS +from .layer.activation import SELU #DEFINE_ALIAS from .layer.activation import LeakyReLU #DEFINE_ALIAS from .layer.activation import Sigmoid #DEFINE_ALIAS from .layer.activation import LogSigmoid -# from .layer.activation import Softmax #DEFINE_ALIAS +from .layer.activation import Softmax #DEFINE_ALIAS +from .layer.activation import Softplus #DEFINE_ALIAS +from .layer.activation import Softshrink #DEFINE_ALIAS +from .layer.activation import Softsign #DEFINE_ALIAS +from .layer.activation import Tanhshrink #DEFINE_ALIAS from .layer.activation import LogSoftmax #DEFINE_ALIAS from .layer.activation import HSigmoid #DEFINE_ALIAS from .layer.common import BilinearTensorProduct #DEFINE_ALIAS from .layer.common import Pool2D #DEFINE_ALIAS from .layer.common import Pad2D #DEFINE_ALIAS +from .layer.common import ReflectionPad1d #DEFINE_ALIAS +from .layer.common import ReplicationPad1d #DEFINE_ALIAS +from .layer.common import ConstantPad1d #DEFINE_ALIAS +from .layer.common import ReflectionPad2d #DEFINE_ALIAS +from .layer.common import ReplicationPad2d #DEFINE_ALIAS +from .layer.common import ConstantPad2d #DEFINE_ALIAS +from .layer.common import ZeroPad2d #DEFINE_ALIAS +from .layer.common import ReplicationPad3d #DEFINE_ALIAS +from .layer.common import ConstantPad3d #DEFINE_ALIAS +from .layer.common import CosineSimilarity #DEFINE_ALIAS from .layer.common import Embedding #DEFINE_ALIAS from .layer.common import Linear #DEFINE_ALIAS from .layer.common import Flatten #DEFINE_ALIAS from .layer.common import UpSample #DEFINE_ALIAS -from .layer.conv import Conv2D #DEFINE_ALIAS -from .layer.conv import Conv2DTranspose #DEFINE_ALIAS -from .layer.conv import Conv3D #DEFINE_ALIAS -from .layer.conv import Conv3DTranspose #DEFINE_ALIAS +from .layer.common import UpsamplingNearest2d #DEFINE_ALIAS +from .layer.common import UpsamplingBilinear2d #DEFINE_ALIAS +from .layer.common import Bilinear #DEFINE_ALIAS +from .layer.common import Dropout #DEFINE_ALIAS +from .layer.common import Dropout2D #DEFINE_ALIAS +from .layer.common import Dropout3D #DEFINE_ALIAS +from .layer.common import AlphaDropout #DEFINE_ALIAS +from .layer.pooling import AdaptiveAvgPool2d #DEFINE_ALIAS +from .layer.pooling import AdaptiveAvgPool3d #DEFINE_ALIAS +from .layer.conv import Conv1d #DEFINE_ALIAS +from .layer.conv import Conv2d #DEFINE_ALIAS +from .layer.conv import Conv3d #DEFINE_ALIAS +from .layer.conv import ConvTranspose1d #DEFINE_ALIAS +from .layer.conv import ConvTranspose2d #DEFINE_ALIAS +from .layer.conv import ConvTranspose3d #DEFINE_ALIAS # from .layer.conv import TreeConv #DEFINE_ALIAS # from 
.layer.conv import Conv1D #DEFINE_ALIAS from .layer.extension import RowConv #DEFINE_ALIAS @@ -83,7 +115,9 @@ from .layer.extension import RowConv #DEFINE_ALIAS # from .layer.learning_rate import NoamDecay #DEFINE_ALIAS # from .layer.learning_rate import PiecewiseDecay #DEFINE_ALIAS # from .layer.learning_rate import PolynomialDecay #DEFINE_ALIAS +from .layer.common import Linear # from .layer.loss import NCELoss #DEFINE_ALIAS +from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS from .layer.loss import CrossEntropyLoss #DEFINE_ALIAS from .layer.loss import MSELoss #DEFINE_ALIAS from .layer.loss import L1Loss #DEFINE_ALIAS @@ -91,18 +125,36 @@ from .layer.loss import NLLLoss #DEFINE_ALIAS from .layer.loss import BCELoss #DEFINE_ALIAS from .layer.loss import KLDivLoss #DEFINE_ALIAS from .layer.loss import MarginRankingLoss #DEFINE_ALIAS +from .layer.loss import CTCLoss #DEFINE_ALIAS +from .layer.loss import SmoothL1Loss #DEFINE_ALIAS from .layer.norm import BatchNorm #DEFINE_ALIAS from .layer.norm import SyncBatchNorm #DEFINE_ALIAS from .layer.norm import GroupNorm #DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.norm import InstanceNorm1d #DEFINE_ALIAS +from .layer.norm import InstanceNorm2d #DEFINE_ALIAS +from .layer.norm import InstanceNorm3d #DEFINE_ALIAS +from .layer.norm import BatchNorm1d #DEFINE_ALIAS +from .layer.norm import BatchNorm2d #DEFINE_ALIAS +from .layer.norm import BatchNorm3d #DEFINE_ALIAS +from .layer.rnn import * # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS +from .layer.transformer import MultiHeadAttention +from .layer.transformer import TransformerEncoderLayer +from .layer.transformer import TransformerEncoder +from .layer.transformer import TransformerDecoderLayer +from .layer.transformer import TransformerDecoder +from .layer.transformer import Transformer from .layer.distance import PairwiseDistance #DEFINE_ALIAS +from .layer.vision import PixelShuffle + from .layer import loss #DEFINE_ALIAS from .layer import conv #DEFINE_ALIAS +from .layer import vision #DEFINE_ALIAS from ..fluid.dygraph.layers import Layer #DEFINE_ALIAS from ..fluid.dygraph.container import LayerList, ParameterList, Sequential #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index bc71b8bdf06d2885327bc722b9594208f49478e8..414e70853eb7163230ab2db987fc19c58e168f19 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -25,18 +25,23 @@ from . import extension __all__ += extension.__all__ from . import common __all__ += common.__all__ +from . import pooling +__all__ += pooling.__all__ +from . 
import loss +__all__ += loss.__all__ from .activation import brelu #DEFINE_ALIAS from .activation import elu #DEFINE_ALIAS from .activation import erf #DEFINE_ALIAS from .activation import gelu #DEFINE_ALIAS from .activation import hardshrink #DEFINE_ALIAS +from .activation import hardtanh #DEFINE_ALIAS from .activation import hard_sigmoid #DEFINE_ALIAS from .activation import hard_swish #DEFINE_ALIAS from .activation import hsigmoid #DEFINE_ALIAS from .activation import leaky_relu #DEFINE_ALIAS from .activation import logsigmoid #DEFINE_ALIAS from .activation import maxout #DEFINE_ALIAS -# from .activation import prelu #DEFINE_ALIAS +from .activation import prelu #DEFINE_ALIAS from .activation import relu #DEFINE_ALIAS from .activation import relu6 #DEFINE_ALIAS from .activation import selu #DEFINE_ALIAS @@ -47,10 +52,14 @@ from .activation import softplus #DEFINE_ALIAS from .activation import softshrink #DEFINE_ALIAS from .activation import softsign #DEFINE_ALIAS from .activation import swish #DEFINE_ALIAS -from .activation import tanh_shrink #DEFINE_ALIAS +from .activation import tanh #DEFINE_ALIAS +from .activation import tanhshrink #DEFINE_ALIAS from .activation import thresholded_relu #DEFINE_ALIAS from .activation import log_softmax #DEFINE_ALIAS from .common import dropout #DEFINE_ALIAS +from .common import dropout2d #DEFINE_ALIAS +from .common import dropout3d #DEFINE_ALIAS +from .common import alpha_dropout #DEFINE_ALIAS # from .common import embedding #DEFINE_ALIAS # from .common import fc #DEFINE_ALIAS from .common import label_smooth #DEFINE_ALIAS @@ -58,14 +67,19 @@ from .common import one_hot #DEFINE_ALIAS from .common import pad #DEFINE_ALIAS from .common import pad_constant_like #DEFINE_ALIAS from .common import pad2d #DEFINE_ALIAS +from .common import cosine_similarity #DEFINE_ALIAS from .common import unfold #DEFINE_ALIAS # from .common import bilinear_tensor_product #DEFINE_ALIAS from .common import assign #DEFINE_ALIAS from .common import interpolate #DEFINE_ALIAS +from .common import bilinear #DEFINE_ALIAS +from .conv import conv1d #DEFINE_ALIAS +from .conv import conv_transpose1d #DEFINE_ALIAS +from .common import linear #DEFINE_ALIAS from .conv import conv2d #DEFINE_ALIAS -from .conv import conv2d_transpose #DEFINE_ALIAS +from .conv import conv_transpose2d #DEFINE_ALIAS from .conv import conv3d #DEFINE_ALIAS -from .conv import conv3d_transpose #DEFINE_ALIAS +from .conv import conv_transpose3d #DEFINE_ALIAS from .extension import add_position_encoding #DEFINE_ALIAS # from .extension import autoincreased_step_counter #DEFINE_ALIAS from .extension import continuous_value_model #DEFINE_ALIAS @@ -119,6 +133,8 @@ from .lod import hash #DEFINE_ALIAS # from .lod import dynamic_gru #DEFINE_ALIAS # from .lod import dynamic_lstm #DEFINE_ALIAS # from .lod import dynamic_lstmp #DEFINE_ALIAS +from .loss import binary_cross_entropy #DEFINE_ALIAS +from .loss import binary_cross_entropy_with_logits #DEFINE_ALIAS from .loss import bpr_loss #DEFINE_ALIAS from .loss import center_loss #DEFINE_ALIAS from .loss import cross_entropy #DEFINE_ALIAS @@ -139,23 +155,37 @@ from .loss import sampled_softmax_with_cross_entropy #DEFINE_ALIAS from .loss import sigmoid_cross_entropy_with_logits #DEFINE_ALIAS from .loss import sigmoid_focal_loss #DEFINE_ALIAS from .loss import smooth_l1 #DEFINE_ALIAS +from .loss import smooth_l1_loss #DEFINE_ALIAS from .loss import softmax_with_cross_entropy #DEFINE_ALIAS from .loss import square_error_cost #DEFINE_ALIAS from .loss import ssd_loss #DEFINE_ALIAS from 
.loss import teacher_student_sigmoid_loss #DEFINE_ALIAS -# from .norm import batch_norm #DEFINE_ALIAS +from .loss import ctc_loss #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS # from .norm import group_norm #DEFINE_ALIAS -# from .norm import instance_norm #DEFINE_ALIAS from .norm import l2_normalize #DEFINE_ALIAS -# from .norm import layer_norm #DEFINE_ALIAS +from .norm import batch_norm #DEFINE_ALIAS +from .norm import instance_norm #DEFINE_ALIAS +from .norm import layer_norm #DEFINE_ALIAS from .norm import lrn #DEFINE_ALIAS from .norm import normalize #DEFINE_ALIAS # from .norm import spectral_norm #DEFINE_ALIAS +from .pooling import max_pool1d #DEFINE_ALIAS +from .pooling import avg_pool1d #DEFINE_ALIAS +from .pooling import adaptive_max_pool1d #DEFINE_ALIAS +from .pooling import adaptive_avg_pool1d #DEFINE_ALIAS from .pooling import pool2d #DEFINE_ALIAS from .pooling import pool3d #DEFINE_ALIAS from .pooling import adaptive_pool2d #DEFINE_ALIAS from .pooling import adaptive_pool3d #DEFINE_ALIAS +from .rnn import rnn #DEFINE_ALIAS +from .rnn import birnn #DEFINE_ALIAS +from .pooling import avg_pool2d #DEFINE_ALIAS +from .pooling import max_pool2d #DEFINE_ALIAS +from .pooling import avg_pool3d #DEFINE_ALIAS +from .pooling import max_pool3d #DEFINE_ALIAS +from .pooling import adaptive_avg_pool2d #DEFINE_ALIAS +from .pooling import adaptive_avg_pool3d #DEFINE_ALIAS # from .rnn import gru_unit #DEFINE_ALIAS # from .rnn import lstm #DEFINE_ALIAS # from .rnn import lstm_unit #DEFINE_ALIAS @@ -167,7 +197,7 @@ from .vision import box_clip #DEFINE_ALIAS from .vision import box_coder #DEFINE_ALIAS from .vision import box_decoder_and_assign #DEFINE_ALIAS from .vision import collect_fpn_proposals #DEFINE_ALIAS -# from .vision import deformable_conv #DEFINE_ALIAS +# from .vision import deformable_conv #DEFINE_ALIAS from .vision import deformable_roi_pooling #DEFINE_ALIAS from .vision import density_prior_box #DEFINE_ALIAS from .vision import detection_output #DEFINE_ALIAS @@ -176,10 +206,10 @@ from .vision import fsp_matrix #DEFINE_ALIAS from .vision import generate_mask_labels #DEFINE_ALIAS from .vision import generate_proposal_labels #DEFINE_ALIAS from .vision import generate_proposals #DEFINE_ALIAS -from .vision import grid_sampler #DEFINE_ALIAS +from .vision import grid_sample #DEFINE_ALIAS from .vision import image_resize #DEFINE_ALIAS from .vision import image_resize_short #DEFINE_ALIAS -# from .vision import multi_box_head #DEFINE_ALIAS +# from .vision import multi_box_head #DEFINE_ALIAS from .vision import pixel_shuffle #DEFINE_ALIAS from .vision import prior_box #DEFINE_ALIAS from .vision import prroi_pool #DEFINE_ALIAS @@ -196,3 +226,4 @@ from .vision import shuffle_channel #DEFINE_ALIAS from .vision import space_to_depth #DEFINE_ALIAS from .vision import yolo_box #DEFINE_ALIAS from .vision import yolov3_loss #DEFINE_ALIAS +from .input import one_hot #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 61476a6172941aa14b33c82f92d6075f0e452f02..ffedb027330bda94db86dc0943a5c4a7281f254f 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -17,18 +17,12 @@ from ...fluid.layers import brelu #DEFINE_ALIAS from ...fluid.layers import erf #DEFINE_ALIAS from ...fluid.layers import hard_sigmoid #DEFINE_ALIAS from ...fluid.layers import hard_swish #DEFINE_ALIAS -from ...fluid.layers import leaky_relu #DEFINE_ALIAS from ...fluid.layers import maxout #DEFINE_ALIAS -from 
...fluid.layers import relu6 #DEFINE_ALIAS -from ...fluid.layers import selu #DEFINE_ALIAS from ...fluid.layers import soft_relu #DEFINE_ALIAS -from ...fluid.layers import softplus #DEFINE_ALIAS -from ...fluid.layers import softshrink #DEFINE_ALIAS -from ...fluid.layers import softsign #DEFINE_ALIAS from ...fluid.layers import swish #DEFINE_ALIAS from ...fluid.layers import sigmoid #DEFINE_ALIAS -from ...fluid.layers import tanh_shrink #DEFINE_ALIAS from ...fluid.layers import thresholded_relu #DEFINE_ALIAS +from ...tensor.math import tanh #DEFINE_ALIAS __all__ = [ 'brelu', @@ -36,13 +30,14 @@ __all__ = [ 'erf', 'gelu', 'hardshrink', + 'hardtanh', 'hard_sigmoid', 'hard_swish', 'hsigmoid', 'leaky_relu', 'logsigmoid', 'maxout', - # 'prelu', + 'prelu', 'relu', 'relu6', 'selu', @@ -53,9 +48,10 @@ __all__ = [ 'softsign', 'sigmoid', 'swish', - 'tanh_shrink', + 'tanh', + 'tanhshrink', 'thresholded_relu', - 'log_softmax' + 'log_softmax', ] import warnings @@ -70,7 +66,7 @@ def elu(x, alpha=1.0, name=None): """ elu activation. - .. math:: + .. math:: elu(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) @@ -86,16 +82,16 @@ def elu(x, alpha=1.0, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - import numpy as np + import paddle + import paddle.nn.functional as F + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) - out = F.elu(x, alpha=0.2) - # [[-0.12642411 6. ] - # [ 1. 15.6 ]] + x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) + out = F.elu(x, alpha=0.2) + # [[-0.12642411 6. ] + # [ 1. 15.6 ]] """ if in_dygraph_mode(): @@ -117,10 +113,15 @@ def gelu(x, approximate=False, name=None): gelu activation. if approximate is True - .. math:: + + .. math:: + gelu(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) + else - .. math:: + + .. math:: + gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) Parameters: @@ -135,23 +136,15 @@ def gelu(x, approximate=False, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - import numpy as np - - paddle.disable_static() - - data = np.random.randn(2, 3).astype("float32") - x = paddle.to_tensor(data) + import paddle + import paddle.nn.functional as F + import numpy as np - out = F.gelu(x) + paddle.disable_static() - data - # array([[ 0.87165993, -1.0541513 , -0.37214822], - # [ 0.15647964, 0.32496083, 0.33045998]], dtype=float32) - out - # array([[ 0.70456535, -0.15380788, -0.13207214], - # [ 0.08796856, 0.20387867, 0.2080159 ]], dtype=float32) + x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) + out1 = F.gelu(x) # [-0.158655 0.345731 0.841345 1.39979] + out2 = F.gelu(x, True) # [-0.158808 0.345714 0.841192 1.39957] """ if in_dygraph_mode(): @@ -175,13 +168,13 @@ def hardshrink(x, threshold=0.5, name=None): .. math:: hardshrink(x)= - \left\{ - \begin{aligned} - &x, & & if \ x > threshold \\ - &x, & & if \ x < -threshold \\ - &0, & & if \ others - \end{aligned} - \right. + \\left\\{ + \\begin{aligned} + &x, & & if \\ x > threshold \\\\ + &x, & & if \\ x < -threshold \\\\ + &0, & & if \\ others + \\end{aligned} + \\right. Args: x (Tensor): The input Tensor with data type float32, float64. @@ -193,17 +186,16 @@ def hardshrink(x, threshold=0.5, name=None): A Tensor with the same data type and shape as ``x`` . Examples: - .. 
code-block:: python - import paddle - import paddle.nn.functional as F - import numpy as np + import paddle + import paddle.nn.functional as F + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_variable(np.array([-1, 0.3, 2.5])) - out = F.hardshrink(x) # [-1., 0., 2.5] + x = paddle.to_tensor(np.array([-1, 0.3, 2.5])) + out = F.hardshrink(x) # [-1., 0., 2.5] """ if in_dygraph_mode(): @@ -221,6 +213,58 @@ def hardshrink(x, threshold=0.5, name=None): return out +def hardtanh(x, min=-1.0, max=1.0, name=None): + """ + hardtanh activation + + .. math:: + + hardtanh(x)= \\begin{cases} + max, \\text{if } x > max \\\\ + min, \\text{if } x < min \\\\ + x, \\text{otherwise} + \\end{cases} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + min (float, optional): The minimum value of the linear region range. Default is -1. + max (float, optional): The maximum value of the linear region range. Default is 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5])) + out = F.hardtanh(x) # [-1., 0.3, 1.] + """ + + if in_dygraph_mode(): + return core.ops.brelu(x, 't_min', min, 't_max', max) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'hardtanh') + + helper = LayerHelper('hardtanh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='brelu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'t_min': min, + 't_max': max}) + return out + + def hsigmoid(input, label, weight, @@ -278,7 +322,6 @@ def hsigmoid(input, Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`. Examples: - .. code-block:: python from paddle import fluid, nn @@ -344,11 +387,137 @@ def hsigmoid(input, return out +def leaky_relu(x, negative_slope=0.01, name=None): + """ + leaky_relu activation + + .. math:: + leaky\\_relu(x)= + \\left\\{ + \\begin{aligned} + &x, & & if \\ x >= 0 \\\\ + &negative\_slope * x, & & otherwise \\\\ + \\end{aligned} + \\right. \\\\ + + Args: + x (Tensor): The input Tensor with data type float32, float64. + negative_slope (float, optional): Slope of the activation function at + :math:`x < 0` . Default is 0.01. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-2, 0, 1], 'float32')) + out = F.leaky_relu(x) # [-0.02, 0., 1.] + + """ + if in_dygraph_mode(): + return core.ops.leaky_relu(x, 'alpha', negative_slope) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'leaky_relu') + helper = LayerHelper('leaky_relu', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='leaky_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': negative_slope}) + return out + + +def prelu(x, weight, name=None): + """ + prelu activation. + + .. 
math:: + + prelu(x) = max(0, x) + weight * min(0, x) + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + weight (Tensor): The learnable parameter with data type same as ``x``. + The weight shape is [1] or [in], where `in` is the input channel of ``x``. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + data = np.array([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], 'float32') + x = paddle.to_tensor(data) + w = paddle.to_tensor(np.array([0.25]).astype('float32')) + out = F.prelu(x, w) + # [[[[-0.5 , 3. , -1. , 5. ], + # [ 3. , -1. , 5. , -1.5 ], + # [-1.75, -2. , 8. , 9. ]], + # [[ 1. , -0.5 , -0.75, 4. ], + # [-1.25, 6. , 7. , -2. ], + # [ 6. , 7. , 8. , 9. ]]]] + """ + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'prelu') + check_variable_and_dtype(weight, 'weight', + ['float16', 'float32', 'float64'], 'prelu') + + helper = LayerHelper('prelu', **locals()) + assert len(weight.shape + ) == 1, "The dim count of weight shape should be 1 in prelu()." + + # NOTE(): The input of this API should be ``N,C,...`` format, + # which means x.shape[0] is batch_size and x.shape[0] is channel. + mode = 'all' + if weight.shape[0] > 1: + assert len( + x.shape + ) > 1, "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]." + assert weight.shape[0] == x.shape[ + 1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]." + mode = 'channel' + + if in_dygraph_mode(): + return core.ops.prelu(x, weight, 'mode', mode) + + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type="prelu", + inputs={"X": x, + "Alpha": weight}, + outputs={"Out": out}, + attrs={"mode": mode}) + return out + + def relu(x, name=None): """ - ReLU Activation. + relu activation. - .. math: + .. math:: out = max(x, 0) @@ -363,14 +532,14 @@ def relu(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - import numpy as np + import paddle + import paddle.nn.functional as F + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) - out = F.relu(x) # [0., 0., 1.] + x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) + out = F.relu(x) # [0., 0., 1.] """ if in_dygraph_mode(): @@ -387,9 +556,9 @@ def logsigmoid(x, name=None): """ logsigmoid activation. - .. math: + .. math:: - logsigmoid(x) = \log \frac{1}{1 + e^{-x}} + logsigmoid(x) = log \\frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -402,14 +571,14 @@ def logsigmoid(x, name=None): Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F - import numpy as np + import paddle + import paddle.nn.functional as F + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) - out = F.logsigmoid(x) # [0.7310586, 0.880797, 0.95257413, 0.98201376] + x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) + out = F.logsigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ if in_dygraph_mode(): @@ -423,7 +592,110 @@ def logsigmoid(x, name=None): return out -def softmax(x, axis=-1, name=None): +def relu6(x, name=None): + """ + relu6 activation + + .. math:: + + relu6(x) = min(max(0,x), 6) + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-1, 0.3, 6.5])) + out = F.relu6(x) # [0, 0.3, 6] + """ + threshold = 6.0 + if in_dygraph_mode(): + return core.ops.relu6(x, 'threshold', threshold) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6') + helper = LayerHelper('relu6', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='relu6', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}) + return out + + +def selu(x, + scale=1.0507009873554804934193349852946, + alpha=1.6732632423543772848170429916717, + name=None): + """ + selu activation + + .. math:: + + selu(x)= scale * + \\begin{cases} + x, \\text{if } x > 0 \\\\ + alpha * e^{x} - alpha, \\text{if } x <= 0 + \\end{cases} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + scale (float, optional): The value of scale(must be greater than 1.0) for selu. Default is 1.0507009873554804934193349852946 + alpha (float, optional): The value of alpha(must be no less than zero) for selu. Default is 1.6732632423543772848170429916717 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]])) + out = F.selu(x) # [[0, 1.050701],[2.101402, 3.152103]] + """ + if scale <= 1.0: + raise ValueError( + "The scale must be greater than 1.0. Received: {}.".format(scale)) + + if alpha < 0: + raise ValueError( + "The alpha must be no less than zero. Received: {}.".format(alpha)) + + if in_dygraph_mode(): + return core.ops.selu(x, 'scale', scale, 'alpha', alpha) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu') + helper = LayerHelper('selu', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='selu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale': scale, + 'alpha': alpha}) + return out + + +def softmax(x, axis=-1, dtype=None, name=None): """ This operator implements the softmax layer. The calculation process is as follows: @@ -450,7 +722,7 @@ def softmax(x, axis=-1, name=None): .. 
math:: - out[i, j] = \\frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])} + softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} Example: @@ -499,12 +771,153 @@ def softmax(x, axis=-1, name=None): [0.26762315, 0.26762315, 0.26762315, 0.26762315], [0.72747516, 0.72747516, 0.72747516, 0.72747516]]] - Args: - x (Tensor): The input multi-dimension Tensor with data type float32, float64. - axis (int, optional): The axis along which to perform softmax calculations. - It should be in range [-D, D), where D is the dimensions of ``x`` . - When ``axis`` < 0, it works the same way as :math:`axis + D` . - Default is -1. + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + axis (int, optional): The axis along which to perform log_softmax + calculations. It should be in range [-D, D), where D is the + dimensions of ``x`` . If ``axis`` < 0, it works the same way as + :math:`axis + D` . Default is -1. + dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data + type of the output tensor. If dtype is specified, ``x`` is casted + to ``dtype`` before the operation is performed. This is useful for + preventing data type overflows. Supported dtype: float32, float64. + If ``dtype`` is None, the output Tensor has the same dtype as x. + Default is None. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same shape and data type (use ``dtype`` if it is + specified) as x. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + x = np.array([[[2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0], + [7.0, 8.0, 8.0, 9.0]], + [[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [6.0, 7.0, 8.0, 9.0]]], 'float32') + x = paddle.to_tensor(x) + out1 = F.softmax(x) + out2 = F.softmax(x, dtype='float64') + # out1's data type is float32; out2's data type is float64 + # out1 and out2's value is as follows: + # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.07232949, 0.19661193, 0.19661193, 0.53444665]], + # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]] + """ + + if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)): + dtype = convert_np_dtype_to_dtype_(dtype) + use_cudnn = True if axis is -1 else False + + if in_dygraph_mode(): + outs_cast = x if dtype is None \ + else core.ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + return core.ops.softmax(outs_cast, 'axis', axis, 'use_cudnn', use_cudnn) + + if dtype is None: + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'softmax') + else: + check_dtype(dtype, 'dtype', ['float32', 'float64'], 'softmax', + 'If dtype is not None, it only support float32 or float64.') + + helper = LayerHelper("softmax", **locals()) + outs_cast = x + if dtype is not None: + outs_cast = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='cast', + inputs={'X': x}, + outputs={'Out': outs_cast}, + attrs={'in_dtype': x.dtype, + 'out_dtype': dtype}) + + outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype) + helper.append_op( + type='softmax', + inputs={'X': outs_cast}, + outputs={'Out': outs_softmax}, + attrs={'axis': axis, + 'use_cudnn': use_cudnn}) + + return outs_softmax + + +def softplus(x, beta=1, 
threshold=20, name=None): + """ + softplus activation + + .. math:: + + softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ + \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + beta (float, optional): The value of beta for softplus. Default is 1 + threshold (float, optional): The value of threshold for softplus. Default is 20 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] + """ + if in_dygraph_mode(): + return core.ops.softplus(x, 'beta', beta, 'threshold', threshold) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'softplus') + helper = LayerHelper('softplus', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='softplus', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'beta': beta, + 'threshold': threshold}) + return out + + +def softshrink(x, threshold=0.5, name=None): + """ + softshrink activation + + .. math:: + + softshrink(x)= \\begin{cases} + x - threshold, \\text{if } x > threshold \\\\ + x + threshold, \\text{if } x < -threshold \\\\ + 0, \\text{otherwise} + \\end{cases} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5 name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -512,31 +925,113 @@ def softmax(x, axis=-1, name=None): A Tensor with the same data type and shape as ``x`` . Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) + out = F.softshrink(x) # [-0.4, 0, 0, 0.3] + """ + if threshold < 0: + raise ValueError( + "The threshold must be no less than zero. Received: {}.".format( + threshold)) + + if in_dygraph_mode(): + return core.ops.softshrink(x, 'lambda', threshold) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'softshrink') + helper = LayerHelper('softshrink', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='softshrink', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'lambda': threshold}) + return out + + +def softsign(x, name=None): + """ + softsign activation + + .. math:: + + softsign(x) = \\frac{x}{1 + |x|} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F - import numpy as np + import paddle + import paddle.nn.functional as F + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = np.array([[[2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0], - [7.0, 8.0, 8.0, 9.0]], - [[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]], 'float32') - x = paddle.to_tensor(x) - out = F.softmax(x) - # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.07232949, 0.19661193, 0.19661193, 0.53444665]], - # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]] + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] + """ + if in_dygraph_mode(): + return core.ops.softsign(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'softsign') + helper = LayerHelper('softsign', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='softsign', inputs={'X': x}, outputs={'Out': out}) + return out + + +def tanhshrink(x, name=None): + """ + tanhshrink activation + + .. math:: + + tanhshrink(x) = x - tanh(x) + + Args: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """ - return paddle.fluid.layers.softmax(input=x, axis=axis, name=name) + if in_dygraph_mode(): + return core.ops.tanh_shrink(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'tanhshrink') + helper = LayerHelper('tanh_shrink', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='tanh_shrink', inputs={'X': x}, outputs={'Out': out}) + return out def log_softmax(x, axis=-1, dtype=None, name=None): @@ -546,8 +1041,8 @@ def log_softmax(x, axis=-1, dtype=None, name=None): .. math:: - Out[i, j] = log(softmax(x)) - = log(\\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}) + log\\_softmax[i, j] = log(softmax(x)) + = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])}) Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -571,33 +1066,31 @@ def log_softmax(x, axis=-1, dtype=None, name=None): Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F - import numpy as np + import paddle + import paddle.nn.functional as F + import numpy as np - paddle.disable_static() + paddle.disable_static() + + x = np.array([[[-2.0, 3.0, -4.0, 5.0], + [3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [6.0, 7.0, 8.0, 9.0]]], 'float32') + x = paddle.to_tensor(x) + out1 = F.log_softmax(x) + out2 = F.log_softmax(x, dtype='float64') + # out1's data type is float32; out2's data type is float64 + # out1 and out2's value is as follows: + # [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948] + # [ -2.1270514 -9.127051 -0.12705144 -11.127051 ] + # [-16.313261 -17.313261 -1.3132617 -0.31326184]] + # [[ -3.0518122 -6.051812 -7.051812 -0.051812 ] + # [-12.313267 -1.3132664 -0.3132665 -15.313267 ] + # [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]] + """ - x = np.array([[[-2.0, 3.0, -4.0, 5.0], - [3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [6.0, 7.0, 8.0, 9.0]]], 'float32') - x = paddle.to_tensor(x) - out1 = F.log_softmax(x) - out2 = F.log_softmax(x, dtype='float64') - # out1's data type is float32; out2's data type is float64 - # out1 and out2's value is as follows: - # [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948] - # [ -2.1270514 -9.127051 -0.12705144 -11.127051 ] - # [-16.313261 -17.313261 -1.3132617 -0.31326184]] - # [[ -3.0518122 -6.051812 -7.051812 -0.051812 ] - # [-12.313267 -1.3132664 -0.3132665 -15.313267 ] - # [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]] - """ - - if axis is None: - axis = -1 if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)): dtype = convert_np_dtype_to_dtype_(dtype) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index fe41cb6e64c34f34add3c0652ab5b30efe958161..623af3277fba0e29fb77b02c711e258602f1f75a 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -13,26 +13,45 @@ # limitations under the License. 
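As a quick sanity check on the log_softmax docstring above, the same values can be reproduced with a small NumPy reference that uses the standard max-shifted log-sum-exp identity. This is only an illustrative sketch; ``ref_log_softmax`` is a hypothetical helper, not part of the Paddle API.

.. code-block:: python

    import numpy as np

    def ref_log_softmax(x, axis=-1):
        # log_softmax(x) = (x - m) - log(sum(exp(x - m))), with m = max(x) along `axis`.
        # Subtracting the max keeps exp() from overflowing for large inputs.
        m = np.max(x, axis=axis, keepdims=True)
        shifted = x - m
        return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))

    x = np.array([[-2.0, 3.0, -4.0, 5.0],
                  [3.0, -4.0, 5.0, -6.0]], dtype=np.float64)
    print(ref_log_softmax(x))
    # First row is approximately [-7.1278, -2.1278, -9.1278, -0.1278],
    # matching the docstring output above.
    print(np.exp(ref_log_softmax(x)).sum(axis=-1))  # ~[1. 1.]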
import warnings +import paddle +from ...fluid.framework import in_dygraph_mode, default_main_program from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.layers.tensor import Variable, fill_constant - +from paddle.fluid.layers.tensor import Variable, fill_constant, zeros, concat +from ...fluid.layers import core +from ...fluid import dygraph_utils # TODO: define the common functions to build a neural network -from ...fluid.layers import dropout #DEFINE_ALIAS from ...fluid.layers import label_smooth #DEFINE_ALIAS from ...fluid import one_hot #DEFINE_ALIAS -from ...fluid.layers import pad #DEFINE_ALIAS from ...fluid.layers import pad2d #DEFINE_ALIAS from ...fluid.layers import unfold #DEFINE_ALIAS from ...fluid.layers import assign #DEFINE_ALIAS +from ...fluid.layers import squeeze #DEFINE_ALIAS +from ...fluid.layers import unsqueeze #DEFINE_ALIAS +from ...fluid.layers import elementwise_mul #DEFINE_ALIAS +from ...tensor import clip +from ...tensor import sum +from ...tensor import sqrt +from ...tensor import sum #DEFINE_ALIAS +from ...tensor import sqrt #DEFINE_ALIAS +from ...fluid.data_feeder import check_variable_and_dtype, check_dtype +from ...fluid.framework import Variable, in_dygraph_mode, _varbase_creator #from ...fluid.layers import fc #DEFINE_ALIAS from ...fluid.layers import pad_constant_like #DEFINE_ALIAS +from ...fluid.framework import in_dygraph_mode +from ...fluid import core, dygraph_utils +from ...fluid import core, layers +from ...fluid.data_feeder import check_variable_and_dtype __all__ = [ 'dropout', + 'dropout2d', + 'dropout3d', + 'alpha_dropout', # 'embedding', # 'fc', 'label_smooth', + 'linear', 'one_hot', 'pad', 'pad_constant_like', @@ -40,29 +59,29 @@ __all__ = [ 'unfold', # 'bilinear_tensor_product', 'assign', - 'interpolate' + 'interpolate', + 'upsample', + 'bilinear', + 'cosine_similarity', ] -def interpolate(input, +def interpolate(x, size=None, scale_factor=None, mode='nearest', align_corners=False, - align_mode=1, + align_mode=0, data_format='NCHW', name=None): """ - :alias_main: paddle.nn.functional.interpolate - :alias: paddle.nn.functional.interpolate,paddle.nn.functional.common.interpolate This op resizes a batch of images. The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), and the resizing only applies on the three dimensions(depth, height and width). - **Warning:** the parameter :attr:`actual_shape` will be deprecated in the - future and only use :attr:`out_shape` instead. + Supporting resample methods: 'linear' : Linear interpolation 'bilinear' : Bilinear interpolation @@ -87,7 +106,7 @@ def interpolate(input, interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - Align_corners and align_mode are optional parameters,the calculation method + align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. 
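To make the ``align_corners`` / ``align_mode`` distinction concrete, the sketch below maps output indices back to input coordinates using the same formulas given in the example block of this docstring. It is a minimal illustration in plain NumPy; ``src_index`` is a hypothetical helper, not part of the API.

.. code-block:: python

    import numpy as np

    def src_index(dst_idx, in_size, out_size, align_corners=False, align_mode=0):
        # Source coordinate of each output index for a 1-D resize.
        if align_corners:
            # corner pixels of input and output are pinned to each other
            scale = (in_size - 1.0) / (out_size - 1.0) if out_size > 1 else 0.0
            return dst_idx * scale
        scale = in_size / float(out_size)
        if align_mode == 0:
            # pixel-center mapping; the op clamps the result to [0, in_size - 1]
            return scale * (dst_idx + 0.5) - 0.5
        return scale * dst_idx  # align_mode == 1

    dst = np.arange(8, dtype=np.float64)
    print(src_index(dst, in_size=4, out_size=8, align_corners=True))
    # approximately [0.   0.43 0.86 1.29 1.71 2.14 2.57 3.  ]  (endpoints coincide)
    print(src_index(dst, in_size=4, out_size=8, align_corners=False, align_mode=0))
    # [-0.25  0.25  0.75  1.25  1.75  2.25  2.75  3.25]  (uniformly spaced pixel centers)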
Bicubic interpolation is an extension of cubic interpolation for interpolating @@ -117,18 +136,12 @@ def interpolate(input, W_out = W_{in} * scale_{factor} Nearest neighbor interpolation: - if: + align_corners = False input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: H_out = floor (H_{in} * scale_{factor}) W_out = floor (W_{in} * scale_{factor}) - else: - align_corners = True - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = round(H_{in} * scale_{factor}) - W_out = round(W_{in} * scale_{factor}) Bilinear interpolation: if: @@ -187,22 +200,22 @@ def interpolate(input, https://en.wikipedia.org/wiki/Bicubic_interpolation Parameters: - input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, + x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. - size (list|tuple|Variable|None): Output shape of image resize + size (list|tuple|Tensor|None): Output shape of image resize layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Variable|None): The multiplier for the input height or width. At + scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. + And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. Default: None. mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the input and output tensors are aligned, preserving the values at the - corner pixels. + corner pixels.This only has an effect when 'linear', 'bilinear', 'bicubic' or 'trilinear'. Default: False align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above, it can be \'0\' for src_idx = scale_factor*(dst_indx+0.5)-0.5 , can be \'1\' for @@ -220,7 +233,7 @@ def interpolate(input, A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). Raises: - TypeError: size should be a list or tuple or Variable. + TypeError: size should be a list or tuple or Tensor. ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', 'trilinear', 'bicubic', or 'nearest' currently. ValueError: 'linear' only support 3-D tensor. @@ -238,53 +251,27 @@ def interpolate(input, Examples: .. 
code-block:: python - #declarative mode import paddle import numpy as np - input = fluid.data(name="input", shape=[None,3,6,10]) - #1 - output = paddle.nn.functional.interpolate(input=input, size=[12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = paddle.nn.functional.interpolate(input=input, size=[12,dim1]) - #3 - #x = np.array([3,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = paddle.nn.functional.interpolate(input=input, size=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = paddle.nn.functional.interpolate(input=input, scale_factor=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - input_data = np.random.rand(2,3,6,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), - feed={"input":input_data}, - fetch_list=[output], - return_numpy=True) - - print(output_data[0].shape) - #1 - # (2, 3, 12, 12) - #2 - # (2, 3, 12, 2) - #3 - # (2, 3, 3, 12) - #4 - # (2, 3, 3, 5) - #imperative mode - import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = paddle.nn.functional.interpolate(input=input, size=[12,12]) - print(output.shape) - # [2L, 3L, 12L, 12L] + import paddle.nn.functional as F + paddle.disable_static() + + # given out size + input_data = np.random.rand(2,3,6,10).astype("float32") + x = paddle.to_tensor(input_data) + output_1 = F.interpolate(x=x, size=[12,12]) + print(output_1.shape) + # [2L, 3L, 12L, 12L] + + # given scale + output_2 = F.interpolate(x=x, scale_factor=[2,1]) + print(output_2.shape) + # [2L, 3L, 12L, 10L] + + # bilinear interp + output_3 = F.interpolate(x=x, scale_factor=[2,1], mode="bilinear") + print(output_2.shape) + # [2L, 3L, 12L, 10L] """ data_format = data_format.upper() resample = mode.upper() @@ -302,13 +289,13 @@ def interpolate(input, "The 'resample' of image_resize can only be 'linaer', 'bilinear', 'trilinear', " " 'bicubic' or 'nearest' currently.") - if resample in ['LINEAR'] and len(input.shape) != 3: + if resample in ['LINEAR'] and len(x.shape) != 3: raise ValueError("'linear' only support 3-D tensor.") - if resample in ['BILINEAR', 'NEAREST', 'BICUBIC'] and len(input.shape) != 4: + if resample in ['BILINEAR', 'NEAREST', 'BICUBIC'] and len(x.shape) != 4: raise ValueError( "'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.") - if resample == 'TRILINEAR' and len(input.shape) != 5: + if resample == 'TRILINEAR' and len(x.shape) != 5: raise ValueError("'trilinear'only support 5-D tensor.") if size is None and scale_factor is None: @@ -319,19 +306,21 @@ def interpolate(input, if align_mode != 0 and align_mode != 1: raise ValueError("align_mode can only be 0 or 1") - - helper = LayerHelper('{}_interp'.format(resample_type), **locals()) + if align_corners != 0 and resample == 'NEAREST': + raise ValueError( + "align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear" + ) + helper = LayerHelper('{}_interp_v2'.format(resample_type), **locals()) dtype = helper.input_dtype() - - if len(input.shape) == 3 and data_format not in ['NCW', 'NWC']: + if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']: raise 
ValueError( "Got wrong value for param `data_format`: " + data_format + " received but only `NCW` or `NWC` supported for 3-D input.") - elif len(input.shape) == 4 and data_format not in ['NCHW', 'NHWC']: + elif len(x.shape) == 4 and data_format not in ['NCHW', 'NHWC']: raise ValueError( "Got wrong value for param `data_format`: " + data_format + " received but only `NCHW` or `NHWC` supported for 4-D input.") - elif len(input.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']: + elif len(x.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']: raise ValueError( "Got wrong value for param `data_format`: " + data_format + " received but only `NCDHW` or `NDHWC` supported for 5-D input.") @@ -344,7 +333,10 @@ def interpolate(input, if data_format == 'NHWC' or data_format == 'NDHWC' or data_format == 'NWC': data_layout = 'NHWC' - inputs = {"X": input} + if resample == 'NEAREST': + align_corners = False + + inputs = {"X": x} attrs = { "out_d": -1, "out_h": -1, @@ -393,7 +385,7 @@ def interpolate(input, size_list.append(dim) inputs['SizeTensor'] = new_size_tensor - if len(input.shape) == 3: + if len(x.shape) == 3: if len(out_shape) != 1: raise ValueError( "out_shape length should be 2 for input 3-D tensor") @@ -402,7 +394,7 @@ def interpolate(input, else: out_shape = list(map(int, out_shape)) attrs['out_w'] = out_shape[0] - if len(input.shape) == 4: + if len(x.shape) == 4: if len(out_shape) != 2: raise ValueError("out_shape length should be 2 for " "input 4-D tensor.") @@ -413,7 +405,7 @@ def interpolate(input, out_shape = list(map(int, out_shape)) attrs['out_h'] = out_shape[0] attrs['out_w'] = out_shape[1] - if len(input.shape) == 5: + if len(x.shape) == 5: if len(out_shape) != 3: raise ValueError("out_shape length should be 3 for " "input 5-D tensor.") @@ -434,15 +426,1023 @@ def interpolate(input, elif isinstance(scale, float) or isinstance(scale, int): if scale <= 0: raise ValueError("Attr(scale) should be greater than zero.") - attrs['scale'] = float(scale) + scale_list = [] + for i in range(len(x.shape) - 2): + scale_list.append(scale) + attrs['scale'] = list(map(float, scale_list)) + elif isinstance(scale, list): + if len(scale) != len(x.shape) - 2: + raise ValueError("scale_shape length should be {} for " + "input {}-D tensor.".format( + len(x.shape) - 2, len(x.shape))) + for value in scale: + if value <= 0: + raise ValueError("Attr(scale) should be greater than zero.") + attrs['scale'] = list(map(float, scale)) else: raise TypeError( - "Attr(scale)'s type should be float, int or Variable.") + "Attr(scale)'s type should be float, int, list or Tensor.") + + if in_dygraph_mode(): + attr_list = [] + for k, v in attrs.items(): + attr_list.append(k) + attr_list.append(v) + dy_attr = tuple(attr_list) + if resample_type == "linear": + out = core.ops.linear_interp_v2(x, *dy_attr) + if resample_type == "bilinear": + out = core.ops.bilinear_interp_v2(x, *dy_attr) + if resample_type == "trilinear": + out = core.ops.trilinear_interp_v2(x, *dy_attr) + if resample_type == "nearest": + out = core.ops.nearest_interp_v2(x, *dy_attr) + if resample_type == "bicubic": + out = core.ops.bicubic_interp_v2(x, *dy_attr) + return out out = helper.create_variable_for_type_inference(dtype) helper.append_op( - type='{}_interp'.format(resample_type), + type='{}_interp_v2'.format(resample_type), inputs=inputs, outputs={"Out": out}, attrs=attrs) return out + + +def upsample(x, + size=None, + scale_factor=None, + mode='nearest', + align_corners=False, + align_mode=0, + data_format='NCHW', + name=None): + """ + This op 
resizes a batch of images. + The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) + or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape + (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + and the resizing only applies on the three dimensions(depth, height and width). + + Supporting resample methods: + 'linear' : Linear interpolation + 'bilinear' : Bilinear interpolation + 'trilinear' : Trilinear interpolation + 'nearest' : Nearest neighbor interpolation + 'bicubic' : Bicubic interpolation + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + + Nearest neighbor interpolation is to perform nearest neighbor interpolation + in both the 3rd dimension(in height direction) and the 4th dimension(in width + direction) on input tensor. + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then + again in the other direction. + + Bicubic interpolation is an extension of cubic interpolation for interpolating + data points on a two-dimensional regular grid. The interpolated surface is + smoother than corresponding surfaces obtained by bilinear interpolation or + nearest-neighbor interpolation. + Trilinear interpolation is an extension of linear interpolation for + interpolating functions of three variables (e.g. D-direction, + H-direction and W-direction in this op) on a rectilinear 3D grid. + The linear interpolation is performed on three directions. + align_corners and align_mode are optional parameters,the calculation method + of interpolation can be selected by them. + Example: + .. 
code-block:: text + For scale_factor: + if align_corners = True && out_size > 1 : + scale_factor = (in_size-1.0)/(out_size-1.0) + else: + scale_factor = float(in_size/out_size) + Linear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = W_{in} * scale_{factor} + Nearest neighbor interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = floor (H_{in} * scale_{factor}) + W_out = floor (W_{in} * scale_{factor}) + else: + align_corners = True + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + Bicubic interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + Trilinear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = D_{in} * scale_{factor} + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + https://en.wikipedia.org/wiki/Linear_interpolation. + For details of linear interpolation, please refer to Wikipedia: + + For details of nearest neighbor interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. + + For details of bilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bilinear_interpolation. + + For details of bicubic interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bicubic_interpolation + + For details of trilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Trilinear_interpolation. + + Parameters: + x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, + its data format is specified by :attr:`data_format`. + size (list|tuple|Tensor|None): Output shape of image resize + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. + If a Tensor Variable, its dimensions size should be a 1. + scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At + least one of :attr:`out_shape` or :attr:`scale_factor` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. + Default: None. + mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', + 'bicubic' and 'trilinear' currently. 
Default: 'nearest' + align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the + input and output tensors are aligned, preserving the values at the + corner pixels. + Default: False + align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above, + it can be \'0\' for src_idx = scale_factor*(dst_indx+0.5)-0.5 , can be \'1\' for + src_idx = scale_factor*dst_index. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, + `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored + in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + Returns: + A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), + A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), + or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). + Raises: + TypeError: size should be a list or tuple or Tensor. + ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', + 'trilinear', 'bicubic', or 'nearest' currently. + ValueError: 'linear' only support 3-D tensor. + ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. + ValueError: 'trilinear' only support 5-D tensor. + ValueError: One of size and scale_factor must not be None. + ValueError: size length should be 1 for input 3-D tensor. + ValueError: size length should be 2 for input 4-D tensor. + ValueError: size length should be 3 for input 5-D tensor. + ValueError: scale_factor should be greater than zero. + TypeError: align_corners should be a bool value + ValueError: align_mode can only be '0' or '1' + ValueError: data_format can only be 'NCW', 'NWC', 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'. + Examples: + .. code-block:: python + import paddle + import numpy as np + import paddle.nn.functional as F + paddle.disable_static() + + input = paddle.to_tensor(input_data) + output = F.upsample(input=input, size=[12,12]) + print(output.shape) + # [2L, 3L, 12L, 12L] + + """ + return interpolate(x, size, scale_factor, mode, align_corners, align_mode, + data_format) + + +def bilinear(x1, x2, weight, bias=None, name=None): + """ + + This layer performs bilinear on two inputs. + See :ref:`api_nn_Bilinear` for details and output shape. + + Parameters: + x1 (Tensor): the first input tensor, it's data type should be float32, float64. + x2 (Tensor): the second input tensor, it's data type should be float32, float64. + weight (Parameter): The learnable weights of this layer, shape is [out_features, in1_features, in2_features]. + bias (Parameter, optional): The learnable bias(Bias) of this layer, shape is [1, out_features]. If it is set to None, no bias will be added to the output units. The default value is None. + name (str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. 
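The shapes above follow the usual bilinear tensor product, out[n, k] = x1[n, :] . weight[k] . x2[n, :] + bias[0, k]. A minimal NumPy sketch of that computation is shown below, assuming exactly those shape conventions; ``ref_bilinear`` is a hypothetical reference helper, not part of the API.

.. code-block:: python

    import numpy as np

    def ref_bilinear(x1, x2, weight, bias=None):
        # x1: [N, in1], x2: [N, in2], weight: [out, in1, in2], bias: [1, out]
        # out[n, k] = sum_{i, j} x1[n, i] * weight[k, i, j] * x2[n, j] (+ bias[0, k])
        out = np.einsum('ni,kij,nj->nk', x1, weight, x2)
        if bias is not None:
            out = out + bias
        return out

    x1 = np.random.rand(5, 5).astype('float32')
    x2 = np.random.rand(5, 4).astype('float32')
    w = np.random.rand(1000, 5, 4).astype('float32')
    b = np.random.rand(1, 1000).astype('float32')
    print(ref_bilinear(x1, x2, w, b).shape)  # (5, 1000), the same shape as in the docstring example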
+ + Returns: + Tensor: A 2-D Tensor of shape [batch_size, out_features]. + + Examples: + .. code-block:: python + + import paddle + import numpy + import paddle.nn.functional as F + + paddle.disable_static() + x1 = numpy.random.random((5, 5)).astype('float32') + x2 = numpy.random.random((5, 4)).astype('float32') + w = numpy.random.random((1000, 5, 4)).astype('float32') + b = numpy.random.random((1, 1000)).astype('float32') + + result = F.bilinear(paddle.to_tensor(x1), paddle.to_tensor(x2), paddle.to_tensor(w), paddle.to_tensor(b)) # result shape [5, 1000] + + """ + + if in_dygraph_mode(): + return core.ops.bilinear_tensor_product(x1, x2, weight, bias) + + check_variable_and_dtype(x1, 'x1', ['float32', 'float64'], 'bilinear') + check_variable_and_dtype(x2, 'x2', ['float32', 'float64'], 'bilinear') + + inputs = {"X": x1, "Y": x2, "Weight": weight} + if bias is not None: + inputs["Bias"] = bias + + helper = LayerHelper("bilinear", **locals()) + out = helper.create_variable_for_type_inference(dtype=x1.dtype) + + helper.append_op( + type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out}) + + return out + + +def dropout(x, + p=0.5, + axis=None, + training=True, + mode="upscale_in_train", + name=None): + """ + Dropout is a regularization technique for reducing overfitting by preventing + neuron co-adaption during training. The dropout operator randomly sets the + outputs of some units to zero, while upscale others according to the given + dropout probability. + + Args: + x (Tensor): The input tensor. The data type is float32 or float64. + p (float | int): Probability of setting units to zero. Default 0.5. + axis (int | list): The axis along which the dropout is performed. Default None. + training (bool): A flag indicating whether it is in train phrase or not. Default True. + mode(str): ['upscale_in_train'(default) | 'downscale_in_infer'] + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - dropout_prob) + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor representing the dropout, has same shape and data type as `x` . + + Examples: + We use ``p=0.5`` in the following description for simplicity. + 1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly. + Let's see a simple case when x is a 2d tensor with shape 2*3: + [[1 2 3] + [4 5 6]] + we generate mask with the same shape as x, which is 2*3. The value of mask is + sampled from a Bernoulli distribution randomly. For example, we may get such mask: + [[0 1 0] + [1 0 1]] + So the output is obtained from elementwise multiply of x and mask: + [[0 2 0] + [4 0 6]] + Using default setting, i.e. ``mode='upscale_in_train'`` , + if in training phase, the final upscale output is: + [[0 4 0 ] + [8 0 12]] + if in test phase, the output is the same as input: + [[1 2 3] + [4 5 6]] + we can also set ``mode='downscale_in_infer'`` , then + if in training phase, the final output is: + [[0 2 0] + [4 0 6]] + if in test phase, the scale output is: + [[0.5 1. 1.5] + [2. 2.5 3. ]] + + 2. When ``axis!=None`` , this is useful for dropping whole channels from an image or sequence. 
+ Let's see the simple case when x is a 2d tensor with shape 2*3 again: + [[1 2 3] + [4 5 6]] + (1) If ``axis=0`` , this means the dropout is only performed in axis `0` . + we generate mask with the shape 2*1. Only in axis `0` the value is randomly selected. + For example, we may get such mask: + [[1] + [0]] + The output is obtained from elementwise multiply of x and mask. Doing that the mask will be + broadcast from 2*1 to 2*3: + [[1 1 1] + [0 0 0]] + and the result after elementwise multiply is: + [[1 2 3] + [0 0 0]] + then we can do upscale or downscale according to the setting of other arguments. + (2) If ``axis=1`` , this means the dropout is only performed in axis `1` . + we generate mask with the shape 1*3. Only in axis `1` the value is randomly selected. + For example, we may get such mask: + [[1 0 1]] + Doing elementwise multiply the mask will be broadcast from 1*3 to 2*3: + [[1 0 1] + [1 0 1]] + and the result after elementwise multiply is: + [[1 0 3] + [4 0 6]] + (3) What about ``axis=[0, 1]`` ? This means the dropout is performed in all axes of x, + which is the same case as default setting ``axis=None`` . + (4) You may note that logically `axis=None` means the dropout is performed in none axis of x, + We generate mask with the shape 1*1. Whole input is randomly selected or dropped. + For example, we may get such mask: + [[0]] + Doing elementwise multiply the mask will be broadcast from 1*1 to 2*3: + [[0 0 0] + [0 0 0]] + and the result after elementwise multiply is: + [[0 0 0] + [0 0 0]] + Actually this is not what we want because all elements may set to zero~ + When x is a 4d tensor with shape `NCHW`, we can set ``axis=[0,1]`` and the dropout will be performed + in channel `N` and `C`, `H` and `W` is tied, i.e. + paddle.nn.dropout(x, p, axis=[0,1]) + Please refer to ``paddle.nn.functional.dropout2d`` for more details. + Similarly, when x is a 5d tensor with shape `NCDHW`, we can set ``axis=[0,1]`` to perform + dropout3d. Please refer to ``paddle.nn.functional.dropout3d`` for more details. + + .. 
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.array([[1,2,3], [4,5,6]]).astype('float32') + x = paddle.to_tensor(x) + y_train = paddle.nn.functional.dropout(x, 0.5) + y_test = paddle.nn.functional.dropout(x, 0.5, training=False) + y_0 = paddle.nn.functional.dropout(x, axis=0) + y_1 = paddle.nn.functional.dropout(x, axis=1) + y_01 = paddle.nn.functional.dropout(x, axis=[0,1]) + print(x.numpy()) + print(y_train.numpy()) + print(y_test.numpy()) + print(y_0.numpy()) + print(y_1.numpy()) + print(y_01.numpy()) + + """ + if not isinstance(p, (float, int)): + raise TypeError("p argument should be a number") + if p < 0 or p > 1: + raise ValueError("p argument should between 0 and 1") + if mode not in ('downscale_in_infer', 'upscale_in_train'): + raise ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + if axis and not isinstance(axis, (int, list)): + raise TypeError("datatype of axis argument should be int or list") + + if axis == None: # commonly used dropout + seed = None + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + def get_attrs(prog, dropout_prob, is_test, seed): + if (seed is None or seed == 0) and prog.random_seed != 0: + seed = prog.random_seed + attrs = { + 'dropout_prob': dropout_prob, + 'is_test': is_test, + 'fix_seed': seed is not None, + 'seed': seed if seed is not None else 0, + 'dropout_implementation': mode, + } + return attrs + + if in_dygraph_mode(): + if default_main_program().random_seed != 0: + seed = default_main_program().random_seed + out, mask = core.ops.dropout( + x, 'dropout_prob', p, 'is_test', not training, 'fix_seed', + seed is not None, 'seed', seed + if seed is not None else 0, 'dropout_implementation', mode) + return out + + helper = LayerHelper('dropout', **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'dropout') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + mask = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + + attrs = get_attrs(helper.main_program, p, not training, seed) + + helper.append_op( + type='dropout', + inputs={'X': [x]}, + outputs={'Out': [out], + 'Mask': [mask]}, + attrs=attrs) + return out + else: #sometimes called dropout_nd #TODO: optimize with c++ + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'dropout') + dtype = x.dtype + keep_prob = 1 - p + if training: + if p == 1.: + return layers.scale(x, scale=0.) + + scale_input = layers.scale( + x, scale=1 / keep_prob) if mode == 'upscale_in_train' else x + + #get mask shape + input_shape = x.shape + drop_axes = [axis] if isinstance(axis, int) else axis + if max(drop_axes) > len(input_shape) - 1: + raise ValueError("axis value should less than dimensions of x:{}, but get drop_axes value:{} " \ + .format(len(input_shape), max(drop_axes))) + if len(drop_axes) > len(input_shape): + raise ValueError( + "length of axis should not greater than dimensions of x:{}, but get length of drop axes: {}". 
+ format(len(input_shape), len(drop_axes))) + mask_shape = [1] * len(input_shape) + for i in drop_axes: + mask_shape[i] = input_shape[i] + + #get mask + random_tensor = layers.uniform_random( + mask_shape, dtype='float32', min=0., max=1.0) + p = layers.fill_constant(shape=[1], dtype='float32', value=p) + keep_mask = layers.greater_equal(random_tensor, p) + + scale_input = layers.cast(scale_input, dtype) + keep_mask = layers.cast(keep_mask, dtype) + ret = paddle.multiply(scale_input, keep_mask, name=name) + return ret + else: # test + ret = layers.scale( + x, scale=keep_prob) if mode == 'downscale_in_infer' else x + return ret + + +def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None): + """ + Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` , + a channel is a 2D feature map with the shape `HW` ). Each channel will be zeroed out independently + on every forward call with probability `p` using samples from a Bernoulli distribution. + + See ``paddle.nn.functional.dropout`` for more details. + + Args: + x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C]. + The data type is float32 or float64. + p (float): Probability of setting units to zero. Default 0.5. + training (bool): A flag indicating whether it is in train phrase or not. Default True. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + `NCHW` , `NHWC` . The default is `NCHW` . When it is `NCHW` , the data is + stored in the order of: [batch_size, input_channels, input_height, input_width]. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor representing the dropout2d, has same shape and data type as `x` . + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.random.random(size=(2, 3, 4, 5)).astype('float32') + x = paddle.to_tensor(x) + y_train = paddle.nn.functional.dropout2d(x) #train + y_test = paddle.nn.functional.dropout2d(x, training=False) #test + for i in range(2): + for j in range(3): + print(x.numpy()[i,j,:,:]) + print(y_train.numpy()[i,j,:,:]) # may all 0 + print(y_test.numpy()[i,j,:,:]) + """ + input_shape = x.shape + if len(input_shape) != 4: + raise ValueError("dimensions of x should be 4, but received {} != 4"\ + .format(len(input_shape))) + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s." % str(data_format)) + + return dropout( + x, + p=p, + axis=[0, 1] if data_format == 'NCHW' else [0, 3], + training=training, + mode="upscale_in_train", + name=name) + + +def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None): + """ + Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` , + a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently + on every forward call with probability `p` using samples from a Bernoulli distribution. + + See ``paddle.nn.functional.dropout`` for more details. + + Args: + x (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C]. + The data type is float32 or float64. + p (float): Probability of setting units to zero. Default 0.5. + training (bool): A flag indicating whether it is in train phrase or not. Default True. 
+ data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + ``NCDHW``, ``NDHWC``. The default is ``NCDHW`` . When it is ``NCDHW`` , the data is + stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width]. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor representing the dropout3d, has same shape and data type with `x` . + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32') + x = paddle.to_tensor(x) + y_train = paddle.nn.functional.dropout3d(x) #train + y_test = paddle.nn.functional.dropout3d(x, training=False) #test + print(x.numpy()[0,0,:,:,:]) + print(y_train.numpy()[0,0,:,:,:]) # may all 0 + print(y_test.numpy()[0,0,:,:,:]) + """ + + input_shape = x.shape + if len(input_shape) != 5: + raise ValueError("dimensions of x should be 5, but received {} != 5" \ + .format(len(input_shape))) + + if data_format not in ["NCDHW", "NDHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " + "Attr(data_format): %s." % str(data_format)) + + return dropout( + x, + p=p, + axis=[0, 1] if data_format == 'NCDHW' else [0, 4], + training=training, + mode="upscale_in_train", + name=name) + + +def alpha_dropout(x, p=0.5, training=True, name=None): + """ + Alpha Dropout is a type of Dropout that maintains the self-normalizing property. + For an input with zero mean and unit standard deviation, the output of Alpha Dropout + maintains the original mean and standard deviation of the input. + Alpha Dropout fits well to SELU activate function by randomly setting activations to the negative saturation value. + + Args: + x (Tensor): The input tensor. The data type is float32 or float64. + p (float | int): Probability of setting units to zero. Default 0.5. + training (bool): A flag indicating whether it is in train phrase or not. Default True. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor representing the dropout, has same shape and data type as `x`. + + Examples: + .. 
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.array([[-1, 1], [-1, 1]]).astype('float32') + x = paddle.to_tensor(x) + y_train = paddle.nn.functional.alpha_dropout(x, 0.5) + y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False) + print(x.numpy()) + print(y_train.numpy()) + # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly) + print(y_test.numpy()) + """ + if not isinstance(p, (float, int)): + raise TypeError("p argument should be a float or int") + if p < 0 or p > 1: + raise ValueError("p argument should between 0 and 1") + + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'alpha_dropout') + + if training: + #get transformation params + alpha = 1.6732632423543772848170429916717 + scale = 1.0507009873554804934193349852946 + alpha_p = -alpha * scale + a = ((1 - p) * (1 + p * alpha_p**2))**-0.5 + b = -a * alpha_p * p + + dtype = x.dtype + input_shape = x.shape + + #get mask + random_tensor = layers.uniform_random( + input_shape, dtype='float32', min=0., max=1.0) + p = layers.fill_constant(shape=[1], dtype='float32', value=p) + keep_mask = layers.greater_equal(random_tensor, p) + keep_mask = layers.cast(keep_mask, dtype) + drop_mask = layers.elementwise_sub( + layers.fill_constant( + shape=input_shape, dtype=dtype, value=1.), + keep_mask) + + #apply mask + b = layers.fill_constant(shape=[1], dtype=dtype, value=b) + y = layers.elementwise_add( + paddle.multiply(x, keep_mask), + layers.scale( + drop_mask, scale=alpha_p)) + res = layers.elementwise_add(layers.scale(y, scale=a), b, name=name) + return res + else: # test + return x + + +def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): + """ + Pad tensor according to 'pad' and 'mode'. + If mode is 'reflect', pad[0] and pad[1] must be no greater + than width-1. The height and depth dimension has the same condition. + + Parameters: + x (Tensor): The input tensor with data type float32/double/int32/int64_t. + pad (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. 1. If input dimension is 3, then the pad has the form (pad_left, + pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, + pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form + (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). + + mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. + When in 'constant' mode, this op uses a constant value to pad the input tensor. + When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. + When in 'replicate' mode, uses input boundaries to pad the input tensor. + When in 'circular' mode, uses circular input to pad the input tensor. + Default is 'constant' + value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0 + data_format (str): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of + the input data. + Default is "NCHW" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: a Tensor padded according to pad and mode and data type is same as input. + Return Type: Tensor + + Examples: + .. 
code-block:: text + + x = [[[[[1., 2., 3.], + [4., 5., 6.]]]]] + + Case 0: + pad = [2, 2, 1, 1, 0, 0], + mode = 'constant' + value = 0 + Out = [[[[[0. 0. 0. 0. 0. 0. 0.] + [0. 0. 1. 2. 3. 0. 0.] + [0. 0. 4. 5. 6. 0. 0.] + [0. 0. 0. 0. 0. 0. 0.]]]]] + + Case 1: + pad = [2, 2, 1, 1, 0, 0], + mode = 'reflect' + Out = [[[[[6. 5. 4. 5. 6. 5. 4.] + [3. 2. 1. 2. 3. 2. 1.] + [6. 5. 4. 5. 6. 5. 4.] + [3. 2. 1. 2. 3. 2. 1.]]]]] + + Case 2: + pad = [2, 2, 1, 1, 0, 0], + mode = 'replicate' + Out = [[[[[1. 1. 1. 2. 3. 3. 3.] + [1. 1. 1. 2. 3. 3. 3.] + [4. 4. 4. 5. 6. 6. 6.] + [4. 4. 4. 5. 6. 6. 6.]]]]] + + Case 3: + pad = [2, 2, 1, 1, 0, 0], + mode = 'circular' + Out = [[[[[5. 6. 4. 5. 6. 4. 5.] + [2. 3. 1. 2. 3. 1. 2.] + [5. 6. 4. 5. 6. 4. 5.] + [2. 3. 1. 2. 3. 1. 2.]]]]] + + Code Examples: + .. code-block:: python + import numpy as np + import paddle + import paddle.nn.functional as F + + paddle.disable_static() + + # example 1 + x_shape = (1, 1, 3) + x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 + tensor_x = paddle.to_tensor(x) + y = F.pad(tensor_x, pad=[2, 3], value=1, mode='constant') + print(y.numpy()) + # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] + + # example 2 + x_shape = (1, 1, 2, 3) + x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 + tensor_x = paddle.to_tensor(x) + y = F.pad(tensor_x, pad=[1, 2, 1, 1], value=1, mode='circular') + print(y.numpy()) + # [[[[6. 4. 5. 6. 4. 5.] + # [3. 1. 2. 3. 1. 2.] + # [6. 4. 5. 6. 4. 5.] + # [3. 1. 2. 3. 1. 2.]]]] + """ + assert mode in ['reflect', 'replicate', 'constant', 'circular'], \ + "mode should be one of constant, reflect, replicate, circular, but got {}.".format(mode) + + data_format = data_format.upper() + assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], \ + "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \ + "but got {}".format(data_format) + + x_dim = len(x.shape) + + assert x_dim in [ + 3, 4, 5 + ], "input tesor dimension must be in [3, 4, 5] but got {}".format(x_dim) + + supported_format_map = { + 3: ["NCL", "NLC"], + 4: ["NCHW", "NHWC"], + 5: ["NCDHW", "NDHWC"], + } + assert data_format in supported_format_map[x_dim], \ + "input tensor dimension is {}, it's data format should be in {} but got {}".format( + x_dim, supported_format_map[x_dim], data_format) + + unsqueezed_dim = [] + + if isinstance(pad, Variable): + if data_format in ["NCL", "NCHW", "NCDHW"]: + data_format = "NCDHW" + if x_dim == 3: + pad = concat([zeros((4, ), dtype="int32"), pad], axis=0) + unsqueezed_dim = [3, 4] + x = unsqueeze(x, axes=unsqueezed_dim) + elif x_dim == 4: + pad = concat([pad, zeros((2, ), dtype="int32")], axis=0) + unsqueezed_dim = [2] + x = unsqueeze(x, axes=unsqueezed_dim) + elif data_format in ["NLC", "NHWC", "NDHWC"]: + data_format = "NDHWC" + if x_dim == 3: + pad = concat([zeros((4, ), dtype="int32"), pad], axis=0) + unsqueezed_dim = [2, 3] + x = unsqueeze(x, axes=unsqueezed_dim) + elif x_dim == 4: + pad = concat([pad, zeros((2, ), dtype="int32")], axis=0) + unsqueezed_dim = [1] + x = unsqueeze(x, axes=unsqueezed_dim) + else: + if data_format in ["NCL", "NCHW", "NCDHW"]: + data_format = "NCDHW" + if x_dim == 3: + pad = [0, 0, 0, 0] + pad + unsqueezed_dim = [3, 4] + x = unsqueeze(x, axes=unsqueezed_dim) + elif x_dim == 4: + pad = pad + [0, 0] + unsqueezed_dim = [2] + x = unsqueeze(x, axes=unsqueezed_dim) + elif data_format in ["NLC", "NHWC", "NDHWC"]: + data_format = "NDHWC" + if x_dim == 3: + pad = [0, 0, 0, 0] + pad + unsqueezed_dim = [2, 3] + x = unsqueeze(x, 
axes=unsqueezed_dim) + elif x_dim == 4: + pad = pad + [0, 0] + unsqueezed_dim = [1] + x = unsqueeze(x, axes=unsqueezed_dim) + + if in_dygraph_mode(): + if isinstance(pad, Variable): + pad = pad.numpy() + out = core.ops.pad3d(x, "paddings", pad, "mode", mode, "value", value, + "data_format", data_format, "name", name) + else: + attrs = {'mode': mode, 'value': value, 'data_format': data_format} + inputs = {'X': [x]} + if isinstance(pad, Variable): + inputs['Paddings'] = [pad] + attrs['paddings'] = [] + else: + attrs['paddings'] = pad + + helper = LayerHelper('pad3d', **locals()) + + dtype = helper.input_dtype(input_param_name='input') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs) + + if len(unsqueezed_dim) != 0: + out = squeeze(out, axes=unsqueezed_dim) + + return out + + +def cosine_similarity(x1, x2, axis=1, eps=1e-8): + """ + Compute cosine similarity between x1 and x2 along axis. + + Parameters: + x1 (Tensor): First input. float32/double. + x2 (Tensor): Second input. float32/double. + axis (int): Dimension of vectors to compute cosine similarity. Default is 1. + eps(float): Small value to avoid division by zero. Default is 1e-8. + + Returns: a Tensor representing cosine similarity between x1 and x2 along axis. + Return Type: Tensor + + Examples: + .. code-block:: text + Case 0: + x1 = [[0.8024077 0.9927354 0.27238318 0.8344984 ] + [0.48949873 0.5797396 0.65444374 0.66510963] + [0.1031398 0.9614342 0.08365563 0.6796464 ] + [0.10760343 0.7461209 0.7726148 0.5801006 ]] + x2 = [[0.62913156 0.1536727 0.9847992 0.04591406] + [0.9098952 0.15715368 0.8671125 0.3156102 ] + [0.4427798 0.54136837 0.5276275 0.32394758] + [0.3769419 0.8535014 0.48041078 0.9256797 ]] + axis = 1 + eps = 1e-8 + Out: [0.5275037 0.8368967 0.75037485 0.9245899] + + Code Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + np.random.seed(0) + x1 = np.random.rand(2,3) + x2 = np.random.rand(2,3) + x1 = paddle.to_tensor(x1) + x2 = paddle.to_tensor(x2) + result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0) + print(result.numpy()) + # [0.99806249 0.9817672 0.94987036] + + """ + w12 = sum(elementwise_mul(x1, x2), axis=axis) + w1 = sum(elementwise_mul(x1, x1), axis=axis) + w2 = sum(elementwise_mul(x2, x2), axis=axis) + n12 = sqrt(clip(w1 * w2, min=eps * eps)) + cos_sim = w12 / n12 + return cos_sim + + +def linear(x, weight, bias=None, name=None): + """ + + Fully-connected linear transformation op + + .. math:: + + Out = {XW + b} + + where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively. + + The linear op multiplies input tensor with weight matrix and + produces an output Tensor of shape [N, *, output_dim], + where N is batch size and `*` means any number of additional dimensions and output_dim is the last dim of ``weight``. + If ``bias`` is not None, a bias will be added to the output. + + Args: + x(Tensor): Input tensor, its data type is float16, float32 or float64 + weight(Tensor): Weight tensor, its data type is float16, float32 or float64 + bias(Tensor|None, optional): Bias tensor, its data type is float16, float32 or float64. If it is set to None, no bias will be added to the output units. + name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Default: None. + + Returns: + Output tensor + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + import paddle.nn.functional as F + + input = np.ones((3,1,2), dtype=np.float32) + weight = np.ones((2,2), dtype=np.float32) + bias = np.ones((2), dtype=np.float32) + place = paddle.CPUPlace() + paddle.disable_static(place) + input = paddle.to_tensor(input) + weight = paddle.to_tensor(weight) + bias = paddle.to_tensor(bias) + out = F.linear(input, weight, bias) + print(out) #[3 3 3 3 3 3] + + """ + if in_dygraph_mode(): + pre_bias = _varbase_creator(dtype=x.dtype) + core.ops.matmul(x, weight, pre_bias, 'transpose_X', False, + 'transpose_Y', False, "alpha", 1) + return dygraph_utils._append_bias_in_dygraph( + pre_bias, bias, axis=len(x.shape) - 1) + else: + helper = LayerHelper('linear', **locals()) + dtype = x.dtype + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') + + inputs = {'X': [x], 'Y': [weight]} + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='matmul', inputs=inputs, outputs={'Out': tmp}, attrs=attrs) + if bias is not None: + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_add', + inputs={'X': [tmp], + 'Y': [bias]}, + outputs={'Out': [res]}, + attrs={'axis': len(x.shape) - 1}) + else: + res = tmp + return res diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 2a519718258856fe1f4462422a36dccae7066ad1..f80f200c7163836252faa4b1c932178f6bab0dff 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -13,15 +13,24 @@ # limitations under the License. from __future__ import print_function -__all__ = ['conv2d', 'conv2d_transpose', 'conv3d', 'conv3d_transpose'] +__all__ = [ + 'conv1d', + 'conv_transpose1d', + 'conv2d', + 'conv_transpose2d', + 'conv3d', + 'conv_transpose3d', +] import numpy as np +from ...device import get_cudnn_version from ...fluid.framework import Variable, in_dygraph_mode from ...fluid import core, dygraph_utils from ...fluid.layers import nn, utils from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.param_attr import ParamAttr from ...fluid.layer_helper import LayerHelper +from .common import pad2d def _is_list_or_tuple(input): @@ -87,20 +96,242 @@ def _update_padding_nd(padding, channel_last, num_dims): return padding, padding_algorithm -def conv2d(input, +def conv1d(x, weight, bias=None, + stride=1, padding=0, + dilation=1, + groups=1, + data_format='NCL', + name=None): + """ + The convolution1D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input and + Output are in NCL format, where N is batch size, C is the number of + channels, L is the length of the feature. + Filter is in MCK format, where M is the number of output image channels, + C is the number of input image channels, K is the size of the kernel. + If the groups is greater than 1, C will equal the number of input image + channels divided by the groups. If bias attribution and activation type + are provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a tensor with NCL format. 
+ * :math:`W`: Kernel value, a tensor with MCK format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, L_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, L_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, L_{out})` + + Where + + .. math:: + + L_{out}&= \\frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1 + + Args: + x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type + of input is float16 or float32 or float64. + weight (Tensor): The convolution kernel with shape [M, C/g, K], where M is + the number of output channels, g is the number of groups, K is the kernel's size. + bias (Tensor, optional): The bias with shape [M,]. Default: None. + stride (int or tuple, optional): The stride size. If stride is a tuple, it must + contain one integers, (stride_size). Default: 1. + padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + 1. a string in ['valid', 'same']. + 2. an int, which means the feature map is zero paded by size of `padding` on both sides. + 3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides. + 4. a list[int] or tuple[int] whose length is 2. It has the form [pad_before, pad_after]. + 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). + The default value is 0. + dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must + contain one integer, (dilation_size). Default: 1. + groups (int, optional): The groups number of the conv1d function. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: 1. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`. + The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: + `[batch_size, input_channels, feature_length]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + A tensor representing the conv1d, whose data type is the + same with input. + + Raises: + ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If `data_format` is not "NCL" or "NLC". + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 + or the element corresponding to the input's channel is not 0. + ShapeError: If the input is not 3-D Tensor. + ShapeError: If the input's dimension size and filter's dimension size not equal. + ShapeError: If the dimension size of input minus the size of `stride` is not 1. 
+        ShapeError: If the number of input channels is not equal to filter's channels * groups.
+        ShapeError: If the number of output channels is not divisible by groups.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          x = np.array([[[4, 8, 1, 9],
+                         [7, 2, 0, 9],
+                         [6, 9, 2, 6]]]).astype(np.float32)
+          w = np.array([[[9, 3, 4],
+                         [0, 0, 7],
+                         [2, 5, 6]],
+                        [[0, 3, 4],
+                         [2, 9, 7],
+                         [5, 6, 8]]]).astype(np.float32)
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv1d(x_var, w_var)
+          y_np = y_var.numpy()
+          print(y_np)
+
+          # [[[133. 238.]
+          #   [160. 211.]]]
+    """
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if data_format not in ["NCL", "NLC"]:
+        raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. "
+                         "Received Attr(data_format): {}.".format(data_format))
+
+    channel_last = (data_format == "NLC")
+    channel_dim = -1 if channel_last else 1
+    conv2d_data_format = "NHWC" if channel_last else "NCHW"
+    num_channels = x.shape[channel_dim]
+    num_filters = weight.shape[0]
+    if num_channels < 0:
+        raise ValueError("The channel dimension of the input({}) "
+                         "should be defined. Received: {}.".format(
+                             x.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "the channel of input must be divisible by groups,"
+            "received: the channel of input is {}, the shape of input is {}"
+            ", the groups is {}".format(num_channels, x.shape, groups))
+    if num_filters % groups != 0:
+        raise ValueError(
+            "the number of filters must be divisible by groups,"
+            "received: the number of filters is {}, the shape of weight is {}"
+            ", the groups is {}".format(num_filters, weight.shape, groups))
+
+    # update attrs
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
+    if len(padding) == 2:
+        padding = padding + [0] * 2
+    elif len(padding) == 1:
+        padding = padding + [0]
+    else:
+        raise ValueError(
+            "The size of padding's dimension should be 1 or 2. But got padding={}".
+ format(padding)) + + stride = utils.convert_to_list(stride, 1, 'stride') + [1] + dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] + + l_type = "conv2d" + if (num_channels == groups and num_filters % num_channels == 0 and + not use_cudnn): + l_type = 'depthwise_conv2d' + use_cudnn = False + + inputs = {'Input': [x], 'Filter': [weight]} + attrs = { + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False, + 'fuse_relu_before_depthwise_conv': False, + "padding_algorithm": padding_algorithm, + "data_format": conv2d_data_format + } + squeeze_aixs = -2 if channel_last else -1 + x = nn.unsqueeze(input=x, axes=[squeeze_aixs]) + weight = nn.unsqueeze(input=weight, axes=[-1]) + if in_dygraph_mode(): + attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, + 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False, + 'fuse_relu_before_depthwise_conv', False, "padding_algorithm", + padding_algorithm, "data_format", conv2d_data_format) + out = getattr(core.ops, l_type)(x, weight, *attrs) + if bias is not None: + out = nn.elementwise_add(out, bias, axis=channel_dim) + else: + inputs = {'Input': [x], 'Filter': [weight]} + attrs = { + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False, + 'fuse_relu_before_depthwise_conv': False, + "padding_algorithm": padding_algorithm, + "data_format": conv2d_data_format + } + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'conv2d') + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + outputs = {"Output": [out]} + helper.append_op( + type=l_type, inputs=inputs, outputs=outputs, attrs=attrs) + if bias is not None: + out = nn.elementwise_add(out, bias, axis=channel_dim) + out = nn.squeeze(input=out, axes=[squeeze_aixs]) + return out + + +def conv2d(x, + weight, + bias=None, stride=1, + padding=0, dilation=1, groups=1, - use_cudnn=True, - act=None, data_format="NCHW", name=None): """ - :alias_main: paddle.nn.functional.conv2d - :alias: paddle.nn.functional.conv2d,paddle.nn.functional.conv.conv2d The convolution2D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and @@ -152,12 +383,15 @@ def conv2d(input, W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 Args: - input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type + x (Tensor): The input is 4-D Tensor with shape [N, C, H, W], the data type of input is float16 or float32 or float64. - weight (Variable): The convolution kernel with shape [M, C/g, kH, kW], where M is + weight (Tensor): The convolution kernel with shape [M, C/g, kH, kW], where M is the number of output channels, g is the number of groups, kH is the filter's height, kW is the filter's width. - bias (Variable, optional): The bias with shape [M,]. + bias (Tensor, optional): The bias with shape [M,]. + stride (int|tuple): The stride size. It means the stride in convolution. + If stride is a tuple, it must contain two integers, (stride_height, stride_width). + Otherwise, stride_height = stride_width = stride. Default: stride = 1. padding (string|int|list|tuple): The padding size. 
It means the number of zero-paddings on both sides for each dimension.If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, @@ -168,9 +402,6 @@ def conv2d(input, when `data_format` is `"NHWC"`, `pool_padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - stride (int|tuple): The stride size. It means the stride in convolution. - If stride is a tuple, it must contain two integers, (stride_height, stride_width). - Otherwise, stride_height = stride_width = stride. Default: stride = 1. dilation (int|tuple): The dilation size. It means the spacing between the kernel points. If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). Otherwise, dilation_height = dilation_width = dilation. @@ -180,10 +411,6 @@ def conv2d(input, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1. - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act (str): Activation type, if it is set to None, activation is not appended. - Default: None data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -193,13 +420,9 @@ def conv2d(input, None by default. Returns: - A Variable holding Tensor representing the conv2d, whose data type is the - same with input. If act is None, the tensor variable storing the convolution - result, and if act is not None, the tensor variable storing convolution - and non-linearity activation result. + A Tensor representing the conv2d result, whose data type is the same with input. Raises: - ValueError: If the type of `use_cudnn` is not bool. ValueError: If `data_format` is not "NCHW" or "NHWC". ValueError: If the channel dimmention of the input is less than or equal to zero. ValueError: If `padding` is a string, but not "SAME" or "VALID". @@ -214,62 +437,65 @@ def conv2d(input, Examples: .. code-block:: python - from paddle import fluid + import paddle import paddle.nn.functional as F - import paddle.fluid.dygraph as dg import numpy as np x = np.random.randn(2, 3, 8, 8).astype(np.float32) w = np.random.randn(6, 3, 3, 3).astype(np.float32) - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - w_var = dg.to_variable(w) - y_var = F.conv2d(x_var, w_var, act="relu") - y_np = y_var.numpy() + paddle.disable_static() + + x_var = paddle.to_tensor(x) + w_var = paddle.to_tensor(w) + y_var = F.conv2d(x_var, w_var) + y_np = y_var.numpy() + print(y_np.shape) # (2, 6, 6, 6) """ # entry checks - if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. " - "Received Attr(use_cudnn): {}.".format(use_cudnn)) if data_format not in ["NCHW", "NHWC"]: raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. 
" "Received Attr(data_format): {}.".format(data_format)) channel_last = (data_format == "NHWC") channel_dim = -1 if channel_last else 1 - num_channels = input.shape[channel_dim] + num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: raise ValueError("The channel dimmention of the input({}) " "should be defined. Received: {}.".format( - input.shape, num_channels)) + x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, input.shape, groups)) + ", the groups is {}".format(num_channels, x.shape, groups)) if num_filters % groups != 0: raise ValueError( "the number of filters must be divisible by groups," "received: the number of filters is {}, the shape of weight is {}" ", the groups is {}".format(num_filters, weight.shape, groups)) + # use_cudnn = True if core.is_compiled_with_cuda() else False + cudnn_version = get_cudnn_version() + + use_cudnn = True if (core.is_compiled_with_cuda() and + cudnn_version is not None) else False + # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) stride = utils.convert_to_list(stride, 2, 'stride') dilation = utils.convert_to_list(dilation, 2, 'dilation') l_type = "conv2d" - if (num_channels == groups and num_filters % num_channels == 0 and - not use_cudnn): + if (num_channels == groups and num_filters % num_channels == 0): l_type = 'depthwise_conv2d' + use_cudnn = False - inputs = {'Input': [input], 'Filter': [weight]} + inputs = {'Input': [x], 'Filter': [weight]} attrs = { 'strides': stride, 'paddings': padding, @@ -287,15 +513,13 @@ def conv2d(input, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False, 'fuse_relu_before_depthwise_conv', False, "padding_algorithm", padding_algorithm, "data_format", data_format) - pre_bias = getattr(core.ops, l_type)(input, weight, *attrs) + pre_bias = getattr(core.ops, l_type)(x, weight, *attrs) if bias is not None: - pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = dygraph_utils._append_activation_in_dygraph( - pre_act, act, use_cudnn=use_cudnn) + out = pre_bias else: - inputs = {'Input': [input], 'Filter': [weight]} + inputs = {'Input': [x], 'Filter': [weight]} attrs = { 'strides': stride, 'paddings': padding, @@ -307,8 +531,8 @@ def conv2d(input, "padding_algorithm": padding_algorithm, "data_format": data_format } - check_variable_and_dtype(input, 'input', - ['float16', 'float32', 'float64'], 'conv2d') + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'conv2d') helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() pre_bias = helper.create_variable_for_type_inference(dtype) @@ -316,28 +540,279 @@ def conv2d(input, helper.append_op( type=l_type, inputs=inputs, outputs=outputs, attrs=attrs) if bias is not None: - pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = helper.append_activation(pre_act) + out = pre_bias + return out -def conv2d_transpose(input, +def conv_transpose1d(x, weight, bias=None, - output_size=None, - padding=0, stride=1, + padding=0, + output_padding=0, + groups=1, dilation=1, + output_size=None, + data_format="NCL", + name=None): + """ + The 1-D convolution transpose layer calculates the output 
based on the input, + filter, and dilation, stride, padding. Input(Input) and output(Output) + are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels, + L is the length of the feature. The details of convolution transpose + layer, please refer to the following explanation and references + `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a 3-D Tensor with 'NCL' format or 'NLC' format. + * :math:`W`: Filter value, a 3-D Tensor with 'MCK' format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, a 3-D Tensor with data format 'NCL' or 'NLC', the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, L_{in})` + + Filter shape: :math:`(C_{in}, C_{out}, L_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, L_{out})` + + Where + + .. math:: + + L^\prime_{out} &= (L_{in} - 1) * stride - pad_top - pad_bottom + dilation * (L_f - 1) + 1 + output_padding \\\\ + L_{out} &\in [ L^\prime_{out}, L^\prime_{out} + stride ] + + Note: + The conv1d_transpose can be seen as the backward of the conv1d. For conv1d, + when stride > 1, conv1d maps multiple input shape to the same output shape, + so for conv1d_transpose, when stride > 1, input shape maps multiple output shape. + If output_size is None, :math:`L_{out} = L^\prime_{out}`; + else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}` + and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically. + + Args: + x(Tensor): 3-D tensor with [N, C, L] or [N, L, C] format, + its data type is float32 or float64. + weight(Tensor): The convolution kernel, a Tensor with shape [C, M/g, K], + where M is the number of output channels(filters), g is the number of groups, + K is the size of the kernel. + bias(Tensor, optional): The bias, a Tensor with shape [M, ]. + stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution. + If stride is a tuple, it must contain one integer, `(stride_size)`. + Default: stride = 1. + padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds + `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a + string, either 'VALID' or 'SAME' supported, which is the padding algorithm. + If `padding` is a tuple or list, it could be in two forms: + `[pad]` or `[pad_left, pad_right]`. Default: padding = 0. + output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension. + If it is a tuple, it must contain one integer. Default: 0. + groups(int, optional): The groups number of the conv1d transpose function. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups = 1. + dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points. 
+        If dilation is a tuple, it must contain one integer, `(dilation_size)`.
+        Default: dilation = 1.
+    output_size(int|tuple|list, optional): The output image size. If output size is a
+        tuple, it must contain one integer, `(feature_length)`. If it is None,
+        filter_size, padding, and stride are used to calculate output_size.
+        If output_size and filter_size are specified at the same time, they
+        should follow the formula above. Default: None. output_size and filter_size
+        should not be None at the same time.
+    data_format (str, optional): Specify the data format of the input, and the data format of the output
+        will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`.
+        The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of:
+        `[batch_size, input_channels, input_length]`.
+    name(str, optional): For detailed information, please refer
+        to :ref:`api_guide_Name`. Usually name does not need to be set and
+        None by default.
+
+    Returns:
+        A tensor representing the result of 1-D transpose convolution, whose
+        data type is the same with input. And its shape is (num_batches, channels, length)
+        when data_format is `"NCL"` and (num_batches, length, channels) when data_format is
+        `"NLC"`.
+
+    Raises:
+        ValueError: If `data_format` is a string, but not "NCL" or "NLC".
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
+            or the element corresponding to the input's channel is not 0.
+        ValueError: If `output_size` and filter_size are None at the same time.
+        ValueError: If `output_padding` is greater than `stride`.
+        ShapeError: If the input is not 3-D Tensor.
+        ShapeError: If the input's dimension size and filter's dimension size not equal.
+        ShapeError: If the dimension size of input minus the size of `stride` is not 1.
+        ShapeError: If the number of input channels is not equal to filter's channels.
+        ShapeError: If the size of `output_size` is not equal to that of `stride`.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+
+          paddle.disable_static()
+          # shape: (1, 2, 4)
+          x = np.array([[[4, 0, 9, 7],
+                         [8, 0, 9, 2]]]).astype(np.float32)
+          # shape: (2, 1, 2)
+          w = np.array([[[7, 0]],
+                        [[4, 2]]]).astype(np.float32)
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv_transpose1d(x_var, w_var)
+          y_np = y_var.numpy()
+          print(y_np)
+
+          # [[[60. 16. 99. 75. 4.]]]
+    """
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if data_format not in ['NCL', 'NLC']:
+        raise ValueError(
+            "Attr(data_format) of conv_transpose1d got wrong value: "
+            "received {}, but only 'NCL' or 'NLC' are supported.".format(
+                data_format))
+    channel_last = (data_format == "NLC")
+    channel_dim = -1 if channel_last else 1
+
+    num_channels = x.shape[channel_dim]
+    if num_channels < 0:
+        raise ValueError("The channel dimension of the input({}) "
+                         "should be defined. 
Received: {}.".format( + x.shape, num_channels)) + if num_channels % groups != 0: + raise ValueError( + "the channel of input must be divisible by groups," + "received: the channel of input is {}, the shape of input is {}" + ", the groups is {}".format(num_channels, x.shape, groups)) + + # update attrs + padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1) + + if len(padding) == 2: + padding = padding + [0] * 2 + elif len(padding) == 1: + padding = padding + [0] + else: + raise ValueError( + "The size of padding's dimmention should 1 or 2. But got padding={}". + format(padding)) + + stride = utils.convert_to_list(stride, 1, 'stride') + [1] + dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] + output_padding = utils.convert_to_list(output_padding, 1, + 'output_padding') + [0] + if output_padding[0] > stride[0]: + raise ValueError( + "The size of output_padding should not be greater than stride." + "But got output_padding={} and stride={}".format(output_padding[0], + stride[0])) + + if output_size is None: + output_size = [] + elif isinstance(output_size, (list, tuple, int)): + output_size = utils.convert_to_list(output_size, 1, 'output_size') + [1] + else: + raise ValueError("output_size should be int, or list, tuple of ints") + + op_type = 'conv2d_transpose' + num_filters = weight.shape[1] + if (num_channels == groups and num_filters == 1 and not use_cudnn): + op_type = 'depthwise_conv2d_transpose' + use_cudnn = False + + squeeze_axis = -2 if channel_last else -1 + conv2d_data_format = "NHWC" if channel_last else "NCHW" + + x = nn.unsqueeze(input=x, axes=[squeeze_axis]) + weight = nn.unsqueeze(input=weight, axes=[-1]) + + if in_dygraph_mode(): + attrs = ('output_size', output_size, 'strides', stride, 'paddings', + padding, 'padding_algorithm', padding_algorithm, 'dilations', + dilation, 'groups', groups, 'use_cudnn', use_cudnn, + 'data_format', conv2d_data_format) + out = getattr(core.ops, op_type)(x, weight, *attrs) + if bias is not None: + out = nn.elementwise_add(out, bias, axis=channel_dim) + else: + inputs = {'Input': [x], 'Filter': [weight]} + attrs = { + 'output_size': output_size, + 'strides': stride, + 'paddings': padding, + 'padding_algorithm': padding_algorithm, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'data_format': conv2d_data_format + } + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'conv2d_transpose') + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + outputs = {"Output": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + if bias is not None: + out = nn.elementwise_add(out, bias, axis=channel_dim) + + if output_size is None: + out = pad2d( + out, + padding=[0, output_padding, 0, 0], + data_format=conv2d_data_format, + name=name) + out = nn.squeeze(input=out, axes=[squeeze_axis]) + return out + + +def conv_transpose2d(x, + weight, + bias=None, + stride=1, + padding=0, + output_padding=0, groups=1, - use_cudnn=True, - act=None, + dilation=1, data_format='NCHW', + output_size=None, name=None): """ - :alias_main: paddle.nn.functional.conv2d_transpose - :alias: paddle.nn.functional.conv2d_transpose,paddle.nn.functional.conv.conv2d_transpose The convolution2D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. 
Input(Input) and output(Output) @@ -350,6 +825,7 @@ def conv2d_transpose(input, If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. + See more detail in :ref:`api_nn_conv_ConvTranspose2d` . For each input :math:`X`, the equation is: @@ -398,18 +874,15 @@ def conv2d_transpose(input, conv2d_transpose can compute the kernel size automatically. Args: - input(Variable): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format, + x(Tensor): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format, whose data type is float32 or float64. - weight(Variable): The convolution kernel, a Tensor with shape [C, M/g, kH, kW], + weight(Tensor): The convolution kernel, a Tensor with shape [C, M/g, kH, kW], where M is the number of output channels(filters), g is the number of groups, kH is the height of the kernel, and kW is the width of the kernel. - bias(Variable, optional): The bias, a Tensor with shape [M, ]. - output_size(int|tuple|list, optional): The output image size. If output size is a - tuple, it must contain two integers, (image_height, image_width). None if use - filter_size, padding, and stride to calculate output_size. - If output_size is specified, output_size and filter_size (weight)'s shape - should follow the formula above. Default: None. output_size and filter_size - should not be None at the same time. + bias(Tensor, optional): The bias, a Tensor with shape [M, ]. + stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a tuple, it must contain two integers, (stride_height, stride_width). + Otherwise, stride_height = stride_width = stride. Default: stride = 1. padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string, either 'VALID' or 'SAME' supported, which is the padding algorithm. @@ -421,10 +894,9 @@ def conv2d_transpose(input, when `data_format` is `'NHWC'`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a tuple, it must contain two integers, (stride_height, stride_width). - Otherwise, stride_height = stride_width = stride. Default: stride = 1. - dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. + output_padding(int|list|tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0. + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by @@ -433,10 +905,12 @@ def conv2d_transpose(input, first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups = 1. - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - Default: None. 
+ output_size(int|tuple|list, optional): The output image size. If output size is a + tuple, it must contain two integers, (image_height, image_width). None if use + filter_size, padding, and stride to calculate output_size. + If output_size is specified, output_size and filter_size (weight)'s shape + should follow the formula above. Default: None. output_size and filter_size + should not be None at the same time. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -446,20 +920,17 @@ def conv2d_transpose(input, None by default. Returns: - A Variable holding Tensor representing the conv2d_transpose, whose + A Tensor representing the conv_transpose2d, whose data type is the same with input and shape is (num_batches, channels, out_h, - out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor variable - storing the transposed convolution result, and if act is not None, the - tensor variable storing transposed convolution and non-linearity activation - result. + out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing + transposed convolution result. Raises: - ValueError: If the type of `use_cudnn` is not bool. ValueError: If `data_format` is not "NCHW" or "NHWC". ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and filter_size are None at the same time. + ValueError: If `output_size` and kernel_size are None at the same time. ShapeError: If the input is not 4-D Tensor. ShapeError: If the input's dimension size and filter's dimension size not equal. ShapeError: If the dimension size of input minus the size of `stride` is not 2. @@ -469,28 +940,23 @@ def conv2d_transpose(input, Examples: .. code-block:: python - from paddle import fluid - import paddle.nn.functional as F - import paddle.fluid.dygraph as dg import numpy as np + import paddle + import paddle.nn.functional as F x = np.random.randn(2, 3, 8, 8).astype(np.float32) w = np.random.randn(3, 6, 3, 3).astype(np.float32) - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - w_var = dg.to_variable(w) - y_var = F.conv2d_transpose(x_var, w_var, act="relu") - y_np = y_var.numpy() + paddle.disable_static() + x_var = paddle.to_tensor(x) + w_var = paddle.to_tensor(w) + y_var = F.conv2d_transpose(x_var, w_var) + y_np = y_var.numpy() print(y_np.shape) # (2, 6, 10, 10) """ - if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. " - "Received Attr(use_cudnn): {}.".format(use_cudnn)) if data_format not in ['NCHW', 'NHWC']: raise ValueError( "Attr(data_format) of conv2d_transpose got wrong value: " @@ -498,48 +964,65 @@ def conv2d_transpose(input, data_format)) channel_last = (data_format == "NHWC") channel_dim = -1 if channel_last else 1 - num_channels = input.shape[channel_dim] + num_channels = x.shape[channel_dim] if num_channels < 0: raise ValueError("The channel dimmention of the input({}) " "should be defined. 
Received: {}.".format( - input.shape, num_channels)) + x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, input.shape, groups)) + ", the groups is {}".format(num_channels, x.shape, groups)) + + cudnn_version = get_cudnn_version() + + use_cudnn = True if (core.is_compiled_with_cuda() and + cudnn_version is not None) else False # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) stride = utils.convert_to_list(stride, 2, 'stride') dilation = utils.convert_to_list(dilation, 2, 'dilation') + if output_size is None: output_size = [] - elif isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 2, 'output_size') else: - raise ValueError("output_size should be int, or list, tuple of ints") + if output_padding != 0: + raise ValueError('output_padding option is mutually exclusive with ' + 'output_size') + if isinstance(output_size, (list, tuple, int)): + output_size = utils.convert_to_list(output_size, 2, 'output_size') + else: + raise ValueError( + "output_size should be int, or list, tuple of ints") + + if output_padding == 0: + output_padding = [] + else: + output_padding = utils.convert_to_list(output_padding, 2, + 'output_padding') op_type = 'conv2d_transpose' num_filters = weight.shape[1] - if (num_channels == groups and num_filters == 1 and not use_cudnn): + if (num_channels == groups and num_filters == 1): op_type = 'depthwise_conv2d_transpose' + use_cudnn = False if in_dygraph_mode(): - attrs = ('output_size', output_size, 'strides', stride, 'paddings', - padding, 'padding_algorithm', padding_algorithm, 'dilations', - dilation, 'groups', groups, 'use_cudnn', use_cudnn, - 'data_format', data_format) - pre_bias = getattr(core.ops, op_type)(input, weight, *attrs) + attrs = ('output_padding', output_padding, 'output_size', output_size, + 'strides', stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'dilations', dilation, 'groups', groups, + 'use_cudnn', use_cudnn, 'data_format', data_format) + pre_bias = getattr(core.ops, op_type)(x, weight, *attrs) if bias is not None: - pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = dygraph_utils._append_activation_in_dygraph( - pre_act, act, use_cudnn=use_cudnn) + out = pre_bias else: - inputs = {'Input': [input], 'Filter': [weight]} + inputs = {'Input': [x], 'Filter': [weight]} attrs = { + 'output_padding': output_padding, 'output_size': output_size, 'strides': stride, 'paddings': padding, @@ -549,37 +1032,32 @@ def conv2d_transpose(input, 'use_cudnn': use_cudnn, 'data_format': data_format } - check_variable_and_dtype(input, 'input', - ['float16', 'float32', 'float64'], + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'conv2d_transpose') helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype() - pre_bias = helper.create_variable_for_type_inference(dtype) + pre_bias = helper.create_variable_for_type_inference(x.dtype) outputs = {"Output": [pre_bias]} helper.append_op( type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + if bias is not None: - pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = 
helper.append_activation(pre_act) + out = pre_bias + return out -def conv3d(input, +def conv3d(x, weight, bias=None, - padding=0, stride=1, + padding=0, dilation=1, groups=1, - use_cudnn=True, - act=None, data_format="NCDHW", name=None): """ - :alias_main: paddle.nn.functional.conv3d - :alias: paddle.nn.functional.conv3d,paddle.nn.functional.conv.conv3d The convolution3D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and @@ -625,12 +1103,15 @@ def conv3d(input, W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 Args: - input (Variable): The input is 5-D Tensor with shape [N, C, D, H, W], the data + x (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W], the data type of input is float16 or float32 or float64. weight (Variable): The convolution kernel, a Tensor with shape [M, C/g, kD, kH, kW], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. - bias (Variable, optional): The bias, a Tensor of shape [M, ]. + bias (Tensor, optional): The bias, a Tensor of shape [M, ]. + stride (int|tuple): The stride size. It means the stride in convolution. If stride is a + tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. padding (string|int|list|tuple): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, @@ -641,9 +1122,6 @@ def conv3d(input, when `data_format` is `"NDHWC"`, `pool_padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - stride (int|tuple): The stride size. It means the stride in convolution. If stride is a - tuple, it must contain three integers, (stride_depth, stride_height, stride_width). - Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. dilation (int|tuple): The dilation size. It means the spacing between the kernel points. If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. @@ -653,10 +1131,6 @@ def conv3d(input, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act (str): Activation type, if it is set to None, activation is not appended. - Default: None. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -666,13 +1140,12 @@ def conv3d(input, None by default. Returns: - A Variable holding Tensor representing the conv3d, whose data type is + A Tensor representing the conv3d, whose data type is the same with input. 
If act is None, the tensor variable storing the convolution result, and if act is not None, the tensor variable storing convolution and non-linearity activation result. Raises: - ValueError: If the type of `use_cudnn` is not bool. ValueError: If `data_format` is not "NCDHW" or "NDHWC". ValueError: If the channel dimmention of the input is less than or equal to zero. ValueError: If `padding` is a string, but not "SAME" or "VALID". @@ -706,10 +1179,6 @@ def conv3d(input, # (2, 6, 6, 6, 6) """ # entry check - if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): {}. ".format(use_cudnn)) - if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " @@ -717,12 +1186,12 @@ def conv3d(input, channel_last = (data_format == "NDHWC") channel_dim = -1 if channel_last else 1 - num_channels = input.shape[channel_dim] + num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: raise ValueError( "The channel dimmention of the input({}) should be defined. " - "Received: {}.".format(input.shape, num_channels)) + "Received: {}.".format(x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). " @@ -734,6 +1203,10 @@ def conv3d(input, "Received: number of filters({}), groups({}).".format(num_filters, groups)) + cudnn_version = get_cudnn_version() + use_cudnn = True if (core.is_compiled_with_cuda() and + cudnn_version is not None) else False + padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3) stride = utils.convert_to_list(stride, 3, 'stride') dilation = utils.convert_to_list(dilation, 3, 'dilation') @@ -744,15 +1217,13 @@ def conv3d(input, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False, "padding_algorithm", padding_algorithm, "data_format", data_format) - pre_bias = getattr(core.ops, op_type)(input, weight, *attrs) + pre_bias = getattr(core.ops, op_type)(x, weight, *attrs) if bias is not None: - pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = dygraph_utils._append_activation_in_dygraph( - pre_act, act, use_cudnn=use_cudnn) + out = pre_bias else: - inputs = {'Input': [input], 'Filter': [weight]} + inputs = {'Input': [x], 'Filter': [weight]} attrs = { 'strides': stride, 'paddings': padding, @@ -765,8 +1236,8 @@ def conv3d(input, } helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() - check_variable_and_dtype(input, 'input', - ['float16', 'float32', 'float64'], 'conv3d') + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'conv3d') pre_bias = helper.create_variable_for_type_inference(dtype) outputs = {"Output": [pre_bias]} @@ -774,31 +1245,26 @@ def conv3d(input, helper.append_op( type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) if bias is not None: - pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = helper.append_activation(pre_act) + out = pre_bias return out -def conv3d_transpose(input, +def conv_transpose3d(x, weight, bias=None, - output_size=None, - padding=0, stride=1, - dilation=1, + padding=0, + output_padding=0, groups=1, - use_cudnn=True, - act=None, + dilation=1, data_format='NCDHW', + output_size=None, name=None): """ - :alias_main: 
paddle.nn.functional.conv3d_transpose - :alias: paddle.nn.functional.conv3d_transpose,paddle.nn.functional.conv.conv3d_transpose - - The convolution3D transpose layer calculates the output based on the input, + The convolution3d transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, and W @@ -809,6 +1275,7 @@ def conv3d_transpose(input, If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. + See more detail in :ref:`api_nn_conv_ConvTranspose3d` . For each input :math:`X`, the equation is: @@ -861,17 +1328,16 @@ def conv3d_transpose(input, conv3d_transpose can compute the kernel size automatically. Args: - input(Variable): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type + x(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type of input is float32 or float64. - weight (Variable): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW], + weight (Tensor): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. - bias (Variable, optional): The bias, a Tensor of shape [M, ]. - output_size(int|tuple, optional): The output image size. If output size is a - tuple, it must contain three integers, (image_depth, image_height, image_width). This - parameter only works when filter_size is None. If output_size and filter_size are - specified at the same time, They should follow the formula above. Default: None. - Output_size and filter_size should not be None at the same time. + bias (Tensor, optional): The bias, a Tensor of shape [M, ]. + stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a tuple, it must contain three integers, (stride_depth, stride_height, + stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. + Default: stride = 1. padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string, either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding` @@ -882,11 +1348,9 @@ def conv3d_transpose(input, when `data_format` is `'NDHWC'`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a tuple, it must contain three integers, (stride_depth, stride_height, - stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. - Default: stride = 1. - dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. + output_padding(int|list|tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0. + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. @@ -896,32 +1360,32 @@ def conv3d_transpose(input, first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act (str, optional): Activation type, if it is set to None, activation is not appended. - Default: None. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. + output_size(int|list|tuple, optional): The output image size. If output size is a + tuple, it must contain three integers, (image_depth, image_height, image_width). This + parameter only works when filter_size is None. If output_size and filter_size are + specified at the same time, They should follow the formula above. Default: None. + Output_size and filter_size should not be None at the same time. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A Variable holding Tensor representing the conv3d_transpose, whose data + A Tensor representing the conv_transpose3d, whose data type is the same with input and shape is (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor variable storing the transposed convolution result, and if act is not None, the tensor variable storing transposed convolution and non-linearity activation result. Raises: - ValueError: If the type of `use_cudnn` is not bool. ValueError: If `data_format` is not "NCDHW" or "NDHWC". ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and filter_size are None at the same time. + ValueError: If `output_size` and kernel_size are None at the same time. ShapeError: If the input is not 5-D Tensor. ShapeError: If the input's dimension size and filter's dimension size not equal. ShapeError: If the dimension size of input minus the size of `stride` is not 2. @@ -930,29 +1394,26 @@ def conv3d_transpose(input, Examples: .. code-block:: python + + import numpy as np - from paddle import fluid + import paddle import paddle.nn.functional as F - import paddle.fluid.dygraph as dg - import numpy as np x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32) w = np.random.randn(3, 6, 3, 3, 3).astype(np.float32) - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - w_var = dg.to_variable(w) - y_var = F.conv3d_transpose(x_var, w_var, act="relu") - y_np = y_var.numpy() + paddle.disable_static() + + x_var = paddle.to_tensor(x) + w_var = paddle.to_tensor(w) + y_var = F.conv_transpose3d(x_var, w_var) + y_np = y_var.numpy() print(y_np.shape) # (2, 6, 10, 10, 10) """ # entry checks - if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. 
" - "Received Attr(use_cudnn): {}.".format(use_cudnn)) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " @@ -960,12 +1421,12 @@ def conv3d_transpose(input, channel_last = (data_format == "NDHWC") channel_dim = -1 if channel_last else 1 - num_channels = input.shape[channel_dim] + num_channels = x.shape[channel_dim] num_filters = weight.shape[1] if num_channels < 0: raise ValueError( "The channel dimmention of the input({}) should be defined. " - "Received: {}.".format(input.shape, num_channels)) + "Received: {}.".format(x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). " @@ -977,29 +1438,45 @@ def conv3d_transpose(input, dilation = utils.convert_to_list(dilation, 3, 'dilation') if output_size is None: output_size = [] - elif isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 3, 'output_size') else: - raise ValueError("output_size should be int, or list, tuple of ints") + if output_padding != 0: + raise ValueError('output_padding option is mutually exclusive with ' + 'output_size') + if isinstance(output_size, (list, tuple, int)): + output_size = utils.convert_to_list(output_size, 3, 'output_size') + else: + raise ValueError( + "output_size should be int, or list, tuple of ints") + + if output_padding == 0: + output_padding = [] + else: + output_padding = utils.convert_to_list(output_padding, 3, + 'output_padding') + + cudnn_version = get_cudnn_version() + + #TODO(LielinJiang): whether to use cudnn according to the version of cudnn + use_cudnn = True if (core.is_compiled_with_cuda() and + cudnn_version is not None) else False op_type = 'conv3d_transpose' data_format_ = "NHWC" if channel_last else "NCHW" if in_dygraph_mode(): - attrs = ('output_size', output_size, 'paddings', padding, - "padding_algorithm", padding_algorithm, 'strides', stride, - 'dilations', dilation, 'groups', groups, 'use_cudnn', - use_cudnn, "data_format", data_format_) - pre_bias = getattr(core.ops, op_type)(input, weight, *attrs) + attrs = ('output_padding', output_padding, 'output_size', output_size, + 'paddings', padding, "padding_algorithm", padding_algorithm, + 'strides', stride, 'dilations', dilation, 'groups', groups, + 'use_cudnn', use_cudnn, "data_format", data_format_) + pre_bias = getattr(core.ops, op_type)(x, weight, *attrs) if bias is not None: - pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = dygraph_utils._append_activation_in_dygraph( - pre_act, act, use_cudnn=use_cudnn) + out = pre_bias else: - inputs = {'Input': [input], 'Filter': [weight]} + inputs = {'Input': [x], 'Filter': [weight]} attrs = { + 'output_padding': output_padding, 'output_size': output_size, 'paddings': padding, "padding_algorithm": padding_algorithm, @@ -1010,19 +1487,17 @@ def conv3d_transpose(input, "data_format": data_format_ } helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype() - check_variable_and_dtype(input, 'input', - ['float16', 'float32', 'float64'], 'conv3d') + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'conv3d') - pre_bias = helper.create_variable_for_type_inference(dtype) + pre_bias = helper.create_variable_for_type_inference(x.dtype) outputs = {"Output": [pre_bias]} helper.append_op( type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) if bias is not None: - 
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim) + out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: - pre_act = pre_bias - out = helper.append_activation(pre_act) + out = pre_bias return out diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py new file mode 100644 index 0000000000000000000000000000000000000000..e77bf0e39672984f7076938b134f3e54f4c761ab --- /dev/null +++ b/python/paddle/nn/functional/input.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import warnings +from ...fluid.framework import Variable, in_dygraph_mode +from ...fluid.layer_helper import LayerHelper +from ...fluid.layers import core +from ...fluid.data_feeder import check_variable_and_dtype, check_dtype + +__all__ = ['one_hot'] + + +def one_hot(x, num_classes, name=None): + """ + + The operator converts each id in the input 'x' to an one-hot vector with a + num_classes length. The value in the vector dimension corresponding to the id + is 1, and the value in the remaining dimension is 0. + + The shape of output Tensor is generated by appending num_classes dimension + behind the last dimension of the 'x' shape. + + .. code-block:: text + + Example 1: + + input: + x.shape = [4] + x.data = [1, 1, 3, 0] + num_classes = 4 + + output: + Out.shape = [4, 4] + Out.data = [[0., 1., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 0., 1.], + [1., 0., 0., 0.]] + + Example 2: + + input: + x.shape = [4] + x.data = [1, 1, 5, 0] + num_classes = 4 + + output: Throw an exception for Illegal value + The second dimension in X is 5, which is greater than num_classes, + so it throws an exception. + + + Args: + x(Tensor): Tensor with shape :math:`[N_1, N_2, ..., N_k]` , + which contains at least one dimension. The data type is int32 or int64. + num_classes(int): An integer defining the num_classes of the one hot dimension. If input 'x' + is word id, num_classes is generally the dictionary size. + + Returns: + Tensor: The one-hot representations of 'x'. A Tensor with type float32. + + Examples: + .. code-block:: python + + import paddle + # Correspond to the first example above, where label.shape is 4 and one_hot_label.shape is [4, 4]. 
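+            # Each one-hot row is simply row `id` of a num_classes x num_classes
+            # identity matrix, so np.eye(4)[[1, 1, 3, 0]] reproduces the expected
+            # output shown below.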
+ label = paddle.data(name="label", shape=[4, 1], dtype="int64") + # label.shape = [4] + # label.data = [1, 1, 3, 0] + one_hot_label = paddle.nn.functional.one_hot(x=label, num_classes=4) + # one_hot_label.shape = [4, 4] + # one_hot_label.data = [[0., 1., 0., 0.], + # [0., 1., 0., 0.], + # [0., 0., 0., 1.], + # [1., 0., 0., 0.]] + """ + + if in_dygraph_mode(): + return core.ops.one_hot_v2(x, 'depth', num_classes, + 'allow_out_of_range', False) + else: + check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'one_hot_v2') + helper = LayerHelper("one_hot_v2", **locals()) + + one_hot_out = helper.create_variable_for_type_inference(dtype='float32') + if not isinstance(num_classes, Variable): + # user attribute + inputs = {'X': x} + attrs = {'depth': num_classes, 'allow_out_of_range': False} + else: + num_classes.stop_gradient = True + inputs = {'X': x, 'depth_tensor': num_classes} + attrs = {'allow_out_of_range': False} + helper.append_op( + type="one_hot_v2", + inputs=inputs, + attrs=attrs, + outputs={'Out': one_hot_out}, + stop_gradient=True) + return one_hot_out diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f8bc0b1b54e9639a41d3975f894b3a9873014b38..55bb36d136405385a88b991576c2a9091437d456 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -13,8 +13,11 @@ # limitations under the License. import paddle +from ...fluid.layer_helper import LayerHelper +from ...fluid.data_feeder import check_variable_and_dtype +import paddle.fluid as fluid -# TODO: define loss functions of neural network +# TODO: define loss functions of neural network import numpy as np import paddle import paddle.fluid as fluid @@ -22,7 +25,6 @@ from ...fluid.framework import core, in_dygraph_mode from ...fluid.layers.nn import _elementwise_op_in_dygraph from ...fluid.layers import bpr_loss #DEFINE_ALIAS from ...fluid.layers import center_loss #DEFINE_ALIAS -from ...fluid.layers import cross_entropy #DEFINE_ALIAS from ...fluid.layers import dice_loss #DEFINE_ALIAS from ...fluid.layers import iou_similarity #DEFINE_ALIAS from ...fluid.layers import log_loss #DEFINE_ALIAS @@ -42,9 +44,12 @@ from ...fluid.layers import huber_loss #DEFINE_ALIAS from ...fluid.layers import sampled_softmax_with_cross_entropy #DEFINE_ALIAS from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode +from ...fluid.framework import _varbase_creator from ...fluid.framework import Variable __all__ = [ + 'binary_cross_entropy', + 'binary_cross_entropy_with_logits', 'bpr_loss', 'center_loss', 'cross_entropy', @@ -65,13 +70,364 @@ __all__ = [ 'sigmoid_cross_entropy_with_logits', 'sigmoid_focal_loss', 'smooth_l1', + 'smooth_l1_loss', 'softmax_with_cross_entropy', 'square_error_cost', 'ssd_loss', - 'teacher_student_sigmoid_loss' + 'teacher_student_sigmoid_loss', + 'ctc_loss', ] +def binary_cross_entropy(input, label, weight=None, reduction='mean', + name=None): + """ + This op measures the binary_cross_entropy loss between input predictions ``input`` + and target labels ``label`` . The binary_cross_entropy loss can be described as: + + If :attr:`weight` is set, the loss is: + + .. math:: + Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input)) + + If :attr:`weight` is None, the loss is: + + .. math:: + Out = -1 * (label * log(input) + (1 - label) * log(1 - input)) + + If :attr:`reduction` set to ``'none'``, the interface will return the original loss `Out`. 
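+    For example, with predictions ``[0.5, 0.6, 0.7]`` and labels ``[1.0, 0.0, 1.0]``
+    (the values used in the code example below), the unreduced loss is
+    ``[-log(0.5), -log(0.4), -log(0.7)]``, i.e. roughly ``[0.693, 0.916, 0.357]``,
+    and its mean is the ``0.65537095`` printed in that example.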
+ + If :attr:`reduction` set to ``'mean'``, the reduced mean loss is: + + .. math:: + Out = MEAN(Out) + + If :attr:`reduction` set to ``'sum'``, the reduced sum loss is: + + .. math:: + Out = SUM(Out) + + Note that the input predictions ``input`` always be the output of sigmoid, and the target labels ``label`` + should be numbers between 0 and 1. + + Parameters: + input (Tensor): The input predications tensor. 2-D tensor with shape: [N, *], + N is batch_size, `*` means number of additional dimensions. The ``input`` + should always be the output of sigmod. Available dtype is float32, float64. + label (Tensor): The target labels tensor. 2-D tensor with the same shape as + ``input``. The target labels which values should be numbers between 0 and 1. + Available dtype is float32, float64. + weight (Tensor, optional): A manual rescaling weight given to the loss of each + batch element. If given, has to be a Tensor of size nbatch and the data type + is float32, float64. Default is ``'None'``. + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default is ``'mean'``. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + + Returns: + output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is scalar. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + input_data = np.array([0.5, 0.6, 0.7]).astype("float32") + label_data = np.array([1.0, 0.0, 1.0]).astype("float32") + + paddle.disable_static() + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) + output = paddle.nn.functional.binary_cross_entropy(input, label) + print(output.numpy()) # [0.65537095] + paddle.enable_static() + + """ + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in binary_cross_entropy should be 'sum', " + "'mean' or 'none', but received %s, which is not allowed." 
% + reduction) + + if in_dygraph_mode(): + out = core.ops.bce_loss(input, label) + if weight is not None: + out = core.ops.elementwise_mul(out, weight, 'axis', -1) + + if reduction == 'sum': + return core.ops.reduce_sum(out, 'dim', [0], 'keep_dim', False, + "reduce_all", True) + elif reduction == 'mean': + return core.ops.reduce_mean(out, 'dim', [0], 'keep_dim', False, + "reduce_all", True) + else: + return out + + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'binary_cross_entropy') + fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'binary_cross_entropy') + + sub_name = name if weight is None and reduction is 'none' else None + helper = LayerHelper("binary_cross_entropy", name=sub_name) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='bce_loss', + inputs={ + 'X': [input], + 'Label': [label], + }, + outputs={'Out': [out]}) + + if weight is not None: + if isinstance(weight, paddle.framework.Variable): + weight_name = name if reduction is 'none' else None + out = paddle.multiply(out, weight, axis=-1, name=weight_name) + else: + raise ValueError( + "The weight is not a Tensor, please convert to Tensor.") + + if reduction == 'sum': + return paddle.sum(out, name=name) + elif reduction == 'mean': + return paddle.mean(out, name=name) + else: + return out + + +def binary_cross_entropy_with_logits(logit, + label, + weight=None, + reduction='mean', + pos_weight=None, + name=None): + """ + This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. + Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` + layer and some reduce operations. + + This measures the element-wise probability error in classification tasks + in which each class is independent. + This can be thought of as predicting labels for a data-point, where labels + are not mutually exclusive. For example, a news article can be about + politics, technology or sports at the same time or none of these. + + First this operator calculate loss function as follows: + + .. math:: + Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + + We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get: + + .. math:: + Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit}) + + For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0, + we reformulate the loss as follows: + + .. math:: + Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|}) + + Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the + weight tensor on the loss `Out`. The ``weight`` tensor will attach different + weight on every items in the batch. The ``pos_weight`` will attach different + weight on the positive label of each class. + + Finally, this operator applies reduce operation on the loss. + If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. + If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`. + If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`. + + Note that the target labels ``label`` should be numbers between 0 and 1. + + Args: + logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *], + N is batch_size, `*` means number of additional dimensions. The ``logit`` + is usually the output of Linear layer. Available dtype is float32, float64. 
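The numerically stable form above is easy to verify directly; a minimal NumPy sketch, using the same logit/label values as the docstring example below (illustration only, independent of the Paddle ops in this patch):

.. code-block:: python

    # NumPy check of the stable formulation:
    # Out = max(Logit, 0) - Logit * Labels + log(1 + exp(-|Logit|))
    import numpy as np

    logit = np.array([5.0, 1.0, 3.0], dtype="float32")
    label = np.array([1.0, 0.0, 1.0], dtype="float32")

    out = np.maximum(logit, 0) - logit * label + np.log1p(np.exp(-np.abs(logit)))
    print(out.mean())  # ~0.4562, the 'mean' reduction result from the docstring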
+ label (Tensor): The target labels tensor. 2-D tensor with the same shape as + ``logit``. The target labels which values should be numbers between 0 and 1. + Available dtype is float32, float64. + weight (Tensor, optional): A manual rescaling weight given to the loss of each + batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`, + The data type is float32, float64. Default is ``'None'``. + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default is ``'mean'``. + pos_weight (Tensor, optional): A weight of positive examples. Must be a vector + with length equal to the number of classes. The data type is float32, float64. + Default is ``'None'``. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``logit`` , else the shape of output is scalar. + + Examples: + + .. code-block:: python + + import paddle + paddle.disable_static() + logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") + label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") + output = paddle.nn.functional.binary_cross_entropy_with_logits(logit, label) + print(output.numpy()) # [0.45618808] + + """ + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in binary_cross_entropy_with_logits " + "should be 'sum', 'mean' or 'none', but received %s, which is not allowed." 
+ % reduction) + + if in_dygraph_mode(): + one = _varbase_creator(dtype=logit.dtype) + core.ops.fill_constant(one, 'value', + float(1.0), 'force_cpu', False, 'dtype', + one.dtype, 'str_value', '1.0', 'shape', [1]) + out = core.ops.sigmoid_cross_entropy_with_logits(logit, label) + if pos_weight is not None: + log_weight = core.ops.elementwise_add( + core.ops.elementwise_mul( + label, core.ops.elementwise_sub(pos_weight, one)), one) + out = core.ops.elementwise_mul(out, log_weight) + if weight is not None: + out = core.ops.elementwise_mul(out, weight) + + if reduction == "sum": + return core.ops.reduce_sum(out, 'reduce_all', True) + elif reduction == "mean": + return core.ops.mean(out) + else: + return out + + fluid.data_feeder.check_variable_and_dtype( + logit, 'logit', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') + fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') + sigmoid_name = None + if reduction == 'none' and pos_weight is None and weight is None: + sigmoid_name = name + + out = paddle.nn.functional.sigmoid_cross_entropy_with_logits( + logit, label, name=sigmoid_name) + + one = paddle.fill_constant(shape=[1], value=1.0, dtype=logit.dtype) + if pos_weight is not None: + fluid.data_feeder.check_variable_and_dtype( + pos_weight, 'pos_weight', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') + log_weight = paddle.add( + paddle.multiply(label, paddle.elementwise_sub(pos_weight, one)), + one) + pos_weight_name = name if reduction == 'none' and weight is None else None + out = paddle.multiply(out, log_weight, name=pos_weight_name) + + if weight is not None: + fluid.data_feeder.check_variable_and_dtype( + weight, 'weight', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') + weight_name = name if reduction == 'none' else None + out = paddle.multiply(out, weight, name=weight_name) + + if reduction == "sum": + return paddle.sum(out, name=name) + elif reduction == "mean": + return paddle.mean(out, name=name) + return out + + +def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): + """ + This operator calculates smooth_l1_loss. Creates a criterion that uses a squared + term if the absolute element-wise error falls below 1 and an L1 term otherwise. + In some cases it can prevent exploding gradients and it is more robust and less + sensitivity to outliers. Also known as the Huber loss: + + .. math:: + + loss(x,y)=\\frac{1}{n}\\sum_{i}z_i + + + where z_i is given by: + + .. math:: + + \\mathop{z_i}=\\left\\{\\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} + \\end{array} \\right. + + Parameters: + input (Tensor): Input tensor, the data type is float32 or float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is float32 or float64. The shape of label + is the same as the shape of input. + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned. + Default is ``'mean'``. + delta (float, optional): Specifies the hyperparameter delta to be used. 
+ The value determines how large the errors need to be to use L1. Errors + smaller than delta are minimized with L2. Parameter is ignored for + negative/zero values. Default = 1.0 + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + The tensor variable storing the smooth_l1_loss of input and label. + + Return type: Tensor. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + input_data = np.random.rand(3,3).astype("float32") + label_data = np.random.rand(3,3).astype("float32") + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) + output = paddle.nn.functioanl.smooth_l1_loss(input, label) + print(output.numpy()) + """ + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'smooth_l1_loss') + fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'smooth_l1_loss') + + out = huber_loss(input=input, label=label, delta=delta) + + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in smooth_l1_loss should be 'sum', 'mean' or" + " 'none', but received %s, which is not allowed." % reduction) + if reduction == 'none': + return out + elif reduction == 'mean': + return fluid.layers.reduce_mean(out) + elif reduction == 'sum': + return fluid.layers.reduce_sum(out) + + def margin_ranking_loss(input, other, label, @@ -80,9 +436,9 @@ def margin_ranking_loss(input, name=None): """ - This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows. + This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows. - .. math:: + .. math:: margin\_rank\_loss = max(0, -label * (input - other) + margin) If :attr:`reduction` set to ``'mean'``, the reduced mean loss is: @@ -100,7 +456,7 @@ def margin_ranking_loss(input, Parameters: input(Tensor): the first input tensor, it's data type should be float32, float64. other(Tensor): the second input tensor, it's data type should be float32, float64. - label(Tensor): the label value corresponding to input, it's data type should be float32, float64. + label(Tensor): the label value corresponding to input, it's data type should be float32, float64. margin (float, optional): The margin value to add, default value is 0; reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -111,17 +467,21 @@ def margin_ranking_loss(input, .. 
code-block:: python - import numpy as np - import paddle - + import numpy as np + import paddle + paddle.disable_static() - + input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype('float32')) other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype('float32')) label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype('float32')) - loss = paddle.nn.functional.margin_ranking_loss(input, other, label) + loss = paddle.nn.functional.margin_ranking_loss(input, other, label) print(loss.numpy()) # [0.75] """ + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but " + "received %s, which is not allowed." % reduction) if fluid.framework.in_dygraph_mode(): out = core.ops.elementwise_sub(other, input) out = core.ops.elementwise_mul(out, label) @@ -195,15 +555,15 @@ def l1_loss(input, label, reduction='mean', name=None): .. math:: Out = SUM(\lvert input - label\rvert) - + Parameters: input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64. - reduction (str, optional): Indicate the reduction to apply to the loss, + reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If `reduction` is ``'none'``, the unreduced loss is returned; - If `reduction` is ``'mean'``, the reduced mean loss is returned. - If `reduction` is ``'sum'``, the reduced sum loss is returned. + If `reduction` is ``'none'``, the unreduced loss is returned; + If `reduction` is ``'mean'``, the reduced mean loss is returned. + If `reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -214,7 +574,7 @@ def l1_loss(input, label, reduction='mean', name=None): .. code-block:: python import paddle import numpy as np - + paddle.disable_static() input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") @@ -222,16 +582,16 @@ def l1_loss(input, label, reduction='mean', name=None): label = paddle.to_variable(label_data) l1_loss = paddle.nn.functional.l1_loss(input, label) - print(l1_loss.numpy()) + print(l1_loss.numpy()) # [0.35] l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='none') - print(l1_loss.numpy()) + print(l1_loss.numpy()) # [[0.20000005 0.19999999] # [0.2 0.79999995]] l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') - print(l1_loss.numpy()) + print(l1_loss.numpy()) # [1.4] """ if reduction not in ['sum', 'mean', 'none']: @@ -387,21 +747,21 @@ def kl_div(input, label, reduction='mean', name=None): While :math:`x` is input and :math:`y` is label. While :attr:`reduction` is :attr:`none`, output loss is in - the same shape as input, loss in each point is calculated + the same shape as input, loss in each point is calculated seperately and no reduction is applied. - + While :attr:`reduction` is :attr:`mean`, output loss is in shape of [1] and loss value is the mean value of all losses. - + While :attr:`reduction` is :attr:`sum`, output loss is in shape of [1] and loss value is the sum value of all losses. 
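Since kl_div is the only loss here that also offers a 'batchmean' option (described just below), the four reductions are worth spelling out side by side. A NumPy sketch over an arbitrary unreduced loss tensor; the shape and values are illustrative only:

.. code-block:: python

    # Reduction semantics applied to an unreduced loss tensor of shape [N, *].
    import numpy as np

    unreduced = np.random.rand(5, 20).astype("float32")
    batch_size = unreduced.shape[0]

    none_loss = unreduced                          # 'none': keep elementwise losses
    mean_loss = unreduced.mean()                   # 'mean': average over all elements
    sum_loss = unreduced.sum()                     # 'sum': sum over all elements
    batchmean_loss = unreduced.sum() / batch_size  # 'batchmean': sum divided by batch size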
- - While :attr:`reduction` is :attr:`batchmean`, output loss is + + While :attr:`reduction` is :attr:`batchmean`, output loss is in shape of [1] and loss value is the sum value of all losses divided by batch size. Args: - input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means + input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64. label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64. reduction (Tensor): Indicate how to average the loss, @@ -411,7 +771,7 @@ def kl_div(input, label, reduction='mean', name=None): if `reduction` is ``'sum'``, the reduced sum loss is returned; if `reduction` is ``'none'``, no reduction will be apllied. Default is ``'mean'``. - name(str, optional): Name for the operation (optional, default is None). For more information, + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -423,9 +783,9 @@ def kl_div(input, label, reduction='mean', name=None): import paddle import numpy as np import paddle.nn.functional as F - + paddle.enable_imperative() - + shape = (5, 20) input = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') @@ -434,7 +794,7 @@ def kl_div(input, label, reduction='mean', name=None): pred_loss = F.kl_div(paddle.to_variable(input), paddle.to_variable(target), reduction='batchmean') # shape=[5] - + # 'mean' reduction, loss shape will be [1] pred_loss = F.kl_div(paddle.to_variable(input), paddle.to_variable(target), reduction='mean') @@ -508,7 +868,7 @@ def mse_loss(input, label, reduction='mean', name=None): Tensor: The tensor tensor storing the mean square error difference of input and label. Return type: Tensor. - + Examples: .. code-block:: python @@ -569,3 +929,229 @@ def mse_loss(input, label, reduction='mean', name=None): return paddle.sum(paddle.fluid.layers.square( paddle.fluid.layers.elementwise_sub(input, label)), name=name) + + +def ctc_loss(log_probs, + labels, + input_lengths, + label_lengths, + blank=0, + reduction='mean'): + """ + + An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) + to compute Connectionist Temporal Classification (CTC) loss. + It can be aliased as softmax with CTC, since a native softmax activation + is interated to the Warp-CTC library to normalize values for each row of the input tensor. + + Parameters: + log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32. + labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32. + input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64. + label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64. + blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. 
Default is 0. + reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. + + Returns: + Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``. + + Examples: + + .. code-block:: python + + # declarative mode + import paddle.nn.functional as F + import numpy as np + import paddle + + # length of the longest logit sequence + max_seq_length = 4 + #length of the longest label sequence + max_label_length = 3 + # number of logit sequences + batch_size = 2 + # class num + class_num = 3 + + np.random.seed(1) + log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], + [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], + + [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01], + [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]], + + [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02], + [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]], + + [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01], + [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]], + + [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02], + [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32") + labels = np.array([[1, 2, 2], + [1, 2, 2]]).astype("int32") + input_lengths = np.array([5, 5]).astype("int64") + label_lengths = np.array([3, 3]).astype("int64") + + paddle.disable_static() + log_probs = paddle.to_tensor(log_probs) + labels = paddle.to_tensor(labels) + input_lengths = paddle.to_tensor(input_lengths) + label_lengths = paddle.to_tensor(label_lengths) + + loss = F.ctc_loss(log_probs, labels, + input_lengths, + label_lengths, + blank=0, + reduction='none') + print(loss.numpy()) #[3.9179852 2.9076521] + + loss = F.ctc_loss(log_probs, labels, + input_lengths, + label_lengths, + blank=0, + reduction='mean') + print(loss.numpy()) #[1.1376063] + + """ + + loss_out = fluid.layers.warpctc(log_probs, labels, blank, False, + input_lengths, label_lengths) + + loss_out = fluid.layers.squeeze(loss_out, [-1]) + assert reduction in ['mean', 'sum', 'none'] + if reduction == 'mean': + loss_out = paddle.mean(loss_out / paddle.cast(label_lengths, + loss_out.dtype)) + elif reduction == 'sum': + loss_out = paddle.sum(loss_out) + return loss_out + + +def cross_entropy(input, + label, + weight=None, + ignore_index=-100, + reduction='mean'): + """ + This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``, + and ``NLLLoss`` together. + + It is useful when training a classification problem with ``C`` classes. + If provided, the optional argument ``weight`` should be a 1D Variable assigning + weight to each of the classes. + + For predictions label, and target label, the loss is calculated as follows. + + .. math:: + + loss_j = -\\text{input[class]} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right), j = 1,..., K + + If weight is not ``None``: + + .. math:: + + loss_j = \\text{weight[class]}(-\\text{input[class]} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right)), j = 1,..., K + + Parameters: + input (Tensor): Input tensor, the data type is float32, float64. 
Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + weight (Tensor, optional): Weight tensor, a manual rescaling weight given + to each class and the shape is (C). It has the same dimensions as class + number and the data type is float32, float64. Default is ``'None'``. + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned. + Default is ``'mean'``. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default is ``-100``. + + Returns: + The tensor variable storing the cross_entropy_loss of input and label. + + Return type: Tensor. + + Examples: + .. code-block:: python + + import paddle + paddle.disable_static() + input_data = np.random.random([5, 100]).astype("float64") + label_data = np.random.randint(0, 100, size=(5)).astype(np.int64) + weight_data = np.random.random([100]).astype("float64") + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) + weight = paddle.to_tensor(weight_data) + loss = paddle.nn.functional.cross_entropy(input=input, label=label, weight=weight) + print(loss.numpy()) + + """ + if not in_dygraph_mode(): + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'cross_entropy_loss') + fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], + 'cross_entropy_loss') + + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in cross_entropy_loss should be 'sum', 'mean' or" + " 'none', but received %s, which is not allowed." % reduction) + + #step 1. log_softmax + log_softmax_out = paddle.nn.functional.log_softmax(input) + if weight is not None and not isinstance(weight, Variable): + raise ValueError( + "The weight' is not a Variable, please convert to Variable.") + + #step 2. 
nll_loss + input = log_softmax_out + helper = LayerHelper('nll_loss', **locals()) + dtype = helper.input_dtype(input) + + if not in_dygraph_mode(): + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'nll_loss') + fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], + 'nll_loss') + + x_shape = list(input.shape) + n = x_shape[0] + c = x_shape[1] + x_dims = len(x_shape) + if x_dims < 2: + raise ValueError('Expected 2 or more dimensions (got {})'.format( + x_dims)) + if x_dims != 2 and x_dims != 4: + input = reshape(input, shape=[n, c, 1, -1]) + label = reshape(label, shape=[n, 1, -1]) + out_shape = [n] + x_shape[2:] + + if not in_dygraph_mode(): + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'nll_loss') + fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], + 'nll_loss') + inputs = {'X': input, 'Label': label} + attrs = {'reduction': reduction, 'ignore_index': ignore_index} + if weight is not None: + if isinstance(weight, Variable): + inputs['Weight'] = weight + + out = helper.create_variable_for_type_inference(dtype=input.dtype) + total_weight = helper.create_variable_for_type_inference(dtype=input.dtype) + outputs = {'Out': out, 'Total_weight': total_weight} + + helper.append_op( + type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs) + if x_dims != 2 and x_dims != 4 and reduction == 'none': + out = reshape(out, shape=out_shape) + + return out diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 0b007041b4ab336ae355f5d338a0d7dca9b5380e..e9c1a21ecffb1b64cb5ae9e6b802600625cb4685 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -18,16 +18,19 @@ import paddle.fluid as fluid from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode, core +from ...framework import create_parameter from ...fluid.layers import l2_normalize #DEFINE_ALIAS from ...fluid.layers import lrn #DEFINE_ALIAS +from ...fluid.initializer import Constant +from ...fluid.param_attr import ParamAttr +from ...fluid import core, dygraph_utils __all__ = [ - # 'batch_norm', + 'batch_norm', # 'data_norm', - # 'group_norm', - # 'instance_norm', + 'instance_norm', 'l2_normalize', - # 'layer_norm', + 'layer_norm', 'lrn', 'normalize', # 'spectral_norm' @@ -51,8 +54,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): Args: x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64. p (float|int, optional): The exponent value in the norm formulation. Default: 2 - axis (int, optional): The axis on which to apply normalization. If ``x`` is 1-D tensor, ``axis`` is fixed to 0. If `axis < 0`, \ - the dimension to normalization is `x.ndim + axis`. -1 is the last dimension. + axis (int, optional): The axis on which to apply normalization. If `axis < 0`, the dimension to normalization is `x.ndim + axis`. -1 is the last dimension. epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
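What `normalize` computes reduces to dividing by the per-axis p-norm, with epsilon guarding against a zero norm (the static-graph branch later in this hunk uses `maximum(norm, eps)`). A NumPy sketch with the defaults p=2, axis=1, reusing the arange example from the docstring:

.. code-block:: python

    # NumPy sketch of y = x / max(||x||_p, epsilon) along the given axis.
    import numpy as np

    x = np.arange(6, dtype=np.float32).reshape(2, 3)
    p, axis, epsilon = 2, 1, 1e-12

    norm = (np.abs(x) ** p).sum(axis=axis, keepdims=True) ** (1.0 / p)
    y = x / np.maximum(norm, epsilon)
    print(y)
    # ~[[0.     0.4472 0.8944]
    #   [0.4243 0.5657 0.7071]]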
@@ -69,7 +71,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): paddle.disable_static() x = np.arange(6, dtype=np.float32).reshape(2,3) - x = paddle.to_variable(x) + x = paddle.to_tensor(x) y = F.normalize(x) print(y.numpy()) # [[0. 0.4472136 0.8944272 ] @@ -85,8 +87,6 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): # [[0. 0.24253564 0.37139067] # [1. 0.97014254 0.9284767 ]] """ - if len(x.shape) == 1: - axis = 0 if in_dygraph_mode(): eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype) out = core.ops.p_norm(x, 'axis', axis, 'porder', @@ -96,6 +96,10 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): check_type(p, 'p', (float, int), 'normalize') check_type(axis, 'axis', (int), 'normalize') check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'normalize') + if len(x.shape) == 1 and axis != 0 and axis != -1: + raise ValueError( + "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}". + format(axis)) attrs = { 'axis': axis, @@ -110,3 +114,286 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): eps = out.block.create_var(dtype=out.dtype) paddle.fill_constant([1], out.dtype, epsilon, out=eps) return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name) + + +def batch_norm(x, + running_mean, + running_var, + weight, + bias, + training=False, + momentum=0.9, + epsilon=1e-05, + data_format="NCHW", + name=None): + """ + Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + nn.functional.batch_norm is uesd for nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d. Please use above API for BatchNorm. + + Parameters: + x(Tesnor): input value. It's data type should be float32, float64. + running_mean(Tensor): running mean. + running_var(Tensor): running variance. + weight(Tensor): The weight tensor of batch_norm, can not be None. + bias(Tensor): The bias tensor of batch_norm can not be None. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW". + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + x = np.random.seed(123) + x = np.random.random(size=(2, 1, 2, 3)).astype('float32') + running_mean = np.random.random(size=1).astype('float32') + running_variance = np.random.random(size=1).astype('float32') + weight_data = np.random.random(size=1).astype('float32') + bias_data = np.random.random(size=1).astype('float32') + x = paddle.to_tensor(x) + rm = paddle.to_tensor(running_mean) + rv = paddle.to_tensor(running_variance) + w = paddle.to_tensor(weight_data) + b = paddle.to_tensor(bias_data) + batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b) + print batch_norm_out + """ + + assert len(x.shape) >= 2, "input dim must be larger than 1" + + # we use not training means use_global_status, more details see nn._BatchNormBase + use_global_stats = not training + # input ad out must share the memory + mean_out = running_mean + variance_out = running_var + + if in_dygraph_mode(): + # for dygraph need tuple + attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", + data_format, "use_mkldnn", False, "fuse_with_relu", False, + "use_global_stats", use_global_stats) + batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( + x, weight, bias, running_mean, running_var, mean_out, variance_out, + *attrs) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=None) + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') + + # for static need dict + attrs = { + "momentum": momentum, + "epsilon": epsilon, + "data_layout": data_format, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + } + + inputs = { + "X": [x], + "Scale": [weight], + "Bias": [bias], + "Mean": [running_mean], + "Variance": [running_var] + } + + helper = LayerHelper('batch_norm', **locals()) + + dtype = x.dtype if x.dtype is not 'float16' else 'float32' + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference(dtype) + + outputs = { + "Y": [batch_norm_out], + "MeanOut": [running_mean], + "VarianceOut": [running_var], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + + return helper.append_activation(batch_norm_out) + + +def layer_norm(x, + normalized_shape, + weight=None, + bias=None, + epsilon=1e-05, + name=None): + """ + see more detail in paddle.nn.LayerNorm + + Parameters: + x(Tensor): Input Tensor. It's data type should be float32, float64. + normalized_shape(int|list|tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight(Tensor, optional): The weight tensor of batch_norm. Default: None. + bias(Tensor, optional): The bias tensor of batch_norm. Default: None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Returns: + None + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm = paddle.nn.functional.layer_norm(x, x.shape[1:]) + layer_norm_out = layer_norm(x) + + print(layer_norm_out.numpy) + """ + input_shape = list(x.shape) + input_ndim = len(input_shape) + normalized_ndim = len(normalized_shape) + begin_norm_axis = input_ndim - normalized_ndim + if input_ndim < normalized_ndim or input_shape[ + begin_norm_axis:] != normalized_shape: + str_normalized_shape = str(normalized_shape) + raise ValueError('Given normalized_shape is ' + str_normalized_shape + + ', expected input with shape [*, ' + + str_normalized_shape[ + 1:] + ', but got input shape ' + str(input_shape)) + + if in_dygraph_mode(): + pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon, + 'begin_norm_axis', begin_norm_axis) + return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm') + + inputs = dict() + inputs['X'] = [x] + if weight: + inputs['Scale'] = [weight] + if bias: + inputs['Bias'] = [bias] + attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis} + + # create output + helper = LayerHelper('layer_norm', **locals()) + mean_out = helper.create_variable_for_type_inference( + dtype=x.type, stop_gradient=True) + variance_out = helper.create_variable_for_type_inference( + dtype=x.type, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(x.type) + + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, + "begin_norm_axis": begin_norm_axis}) + + return helper.append_activation(layer_norm_out) + + +def instance_norm(x, + running_mean=None, + running_var=None, + weight=None, + bias=None, + use_input_stats=True, + momentum=0.9, + eps=1e-05, + data_format="NCHW", + name=None): + """ + See more detail in nn.layer.InstanceNorm2d. + + Parameters: + x(Tensor): Input Tensor. It's data type should be float32, float64. + running_mean(Tensor): running mean. Default None. + running_var(Tensor): running variance. Default None. + weight(Tensor, optional): The weight tensor of instance_norm. Default: None. + bias(Tensor, optional): The bias tensor of instance_norm. Default: None. + eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + use_input_stats(bool): Default True. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Returns: + None. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm_out = paddle.nn.functional.instancenorm(x) + + print(instance_norm_out.numpy) + + """ + + if in_dygraph_mode(): + out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps, + "momentum", momentum, "data_format", + data_format) + return out + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") + + attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format} + + if weight and bias: + inputs = {"X": [x], "Scale": [weight], "Bias": [bias]} + else: + inputs = {"X": [x]} + + helper = LayerHelper('instance_norm', **locals()) + saved_mean = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + instance_norm_out = helper.create_variable_for_type_inference(x.dtype) + + outputs = { + "Y": [instance_norm_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + helper.append_op( + type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return instance_norm_out diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py old mode 100644 new mode 100755 index 618145fb1fad47e2105edf6186bb4606494d57c9..ca657b8be3e67c7acb795a0f427ca5fe2c57b1f2 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -17,5 +17,1447 @@ from ...fluid.layers import pool2d #DEFINE_ALIAS from ...fluid.layers import pool3d #DEFINE_ALIAS from ...fluid.layers import adaptive_pool2d #DEFINE_ALIAS from ...fluid.layers import adaptive_pool3d #DEFINE_ALIAS +from ...fluid import core +from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_ +from ...fluid.layers import utils, LayerHelper +from ...fluid.data_feeder import check_type, check_variable_and_dtype, check_type, check_dtype, convert_dtype +from ...fluid.layers import unsqueeze, squeeze -__all__ = ['pool2d', 'pool3d', 'adaptive_pool2d', 'adaptive_pool3d'] +__all__ = [ + 'pool2d', + 'pool3d', + 'avg_pool1d', + 'max_pool1d', + 'adaptive_avg_pool1d', + 'adaptive_max_pool1d', + 'adaptive_avg_pool2d', + 'adaptive_avg_pool3d', + 'adaptive_pool2d', + 'adaptive_pool3d', + 'max_pool2d', + 'avg_pool2d', + 'max_pool3d', + 'avg_pool3d', +] + + +def check_input(x, dimension): + if len(x.shape) != dimension: + raise ValueError("Excepted Input X is 3-D tensor, but received {}-D {}". + format(len(x.shape), type(x))) + + +def check_instance(x, x_name, types=(int, float)): + + if not isinstance(x, types): + raise ValueError("Excepted {} type for {} but received type: {}. ". + format(types, x_name, type(x))) + + +def update_padding1d(padding, pool_type='avg'): + def is_list_or_tuple(ele): + if isinstance(ele, list) or isinstance(ele, tuple): + return True + return False + + if is_list_or_tuple(padding): + if padding.__len__() == 1 and not is_list_or_tuple(padding[0]): + return [0, padding[0]] + else: + raise ValueError( + "{}_pool1d() argument 'padding' should contain one int (got {})". 
+ format(pool_type, padding.__len__())) + else: + padding = [0, padding] + + return padding + + +def update_padding2d(padding, data_format): + def is_list_or_tuple(ele): + if isinstance(ele, list) or isinstance(ele, tuple): + return True + return False + + if is_list_or_tuple(padding) and len(padding) == 4: + if is_list_or_tuple(padding[0]) and (data_format == "NCHW"): + if not (padding[0] == [0, 0] and padding[1] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding)) + padding = padding[2:4] + padding = [ele for a_list in padding for ele in a_list] + elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): + if not (padding[0] == [0, 0] and padding[3] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding)) + padding = padding[1:3] + padding = [ele for a_list in padding for ele in a_list] + padding = utils.convert_to_list(padding, 4, 'padding') + + if utils._is_symmetric_padding(padding, 2): + padding = [padding[0], padding[2]] + else: + padding = utils.convert_to_list(padding, 2, 'padding') + + return padding + + +def update_padding3d(padding, data_format): + def is_list_or_tuple(ele): + if isinstance(ele, (list, tuple)): + return True + return False + + if is_list_or_tuple(padding) and len(padding) == 5: + if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): + if not (padding[0] == [0, 0] and padding[1] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding)) + padding = padding[2:5] + padding = [ele for a_list in padding for ele in a_list] + elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): + if not (padding[0] == [0, 0] and padding[4] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding)) + padding = padding[1:4] + padding = [ele for a_list in padding for ele in a_list] + padding = utils.convert_to_list(padding, 6, 'padding') + if utils._is_symmetric_padding(padding, 3): + padding = [padding[0], padding[2], padding[4]] + + elif is_list_or_tuple(padding) and len(padding) == 6: + padding = utils.convert_to_list(padding, 6, 'padding') + if utils._is_symmetric_padding(padding, 3): + padding = [padding[0], padding[2], padding[4]] + else: + padding = utils.convert_to_list(padding, 3, 'padding') + + return padding + + +def avg_pool1d(x, + kernel_size, + stride=None, + padding=0, + count_include_pad=True, + ceil_mode=False, + name=None): + """ + + This operation applies a 1D average pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + The output value of the layer with input size (N, C, L), + output (N, C, L_{out}) and kernel_size k can be precisely described as + For average pool1d: + + .. math:: + + Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k]) + + + Args: + x (Tensor): The input tensor of pooling operator which is a 3-D tensor with + shape [N, C, L]. where `N` is batch size, `C` is the number of channels, + `L` is the length of the feature. The data type if float32 or float64. + kernel_size (int|list|tuple): The pool kernel size. 
If pool kernel size is a tuple or list, + it must contain one integers. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain one integers. + padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero, + then the input is implicitly zero-padded on both sides for padding number of points. + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. + ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. + If it is set to False, the floor function will be used. Default False + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of pooling result. The data type is same as input tensor. + + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ValueError: If `padding` is a list or tuple but its length greater than 1. + ShapeError: If the input is not a 3-D. + ShapeError: If the output's shape calculated is not greater than 0. + + + Examples: + + .. code-block:: python + + import paddle + import paddle.nn.functional as F + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0) + # pool_out shape: [1, 3, 16] + + """ + """NCL to NCHW""" + data_format = "NCHW" + check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'avg_pool1d') + check_input(x, 3) + x = unsqueeze(x, [2]) + kernel_size = utils.convert_to_list(kernel_size, 1, 'pool_size') + kernel_size = [1] + kernel_size + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 1, 'pool_stride') + stride = [1] + stride + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(padding)) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0] + if ceil_mode != False: + raise ValueError( + "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. 
" + "Received ceil_mode: True.") + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0] + + padding = update_padding1d(padding, "avg") + + if in_dygraph_mode(): + output = core.ops.pool2d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', + False, 'strides', stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'use_cudnn', not count_include_pad, 'ceil_mode', + ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', + data_format) + return squeeze(output, [2]) + + op_type = 'pool2d' + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": 'avg', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": not count_include_pad, + "data_format": data_format, + }) + + return squeeze(pool_out, [2]) + + +def max_pool1d(x, + kernel_size, + stride=None, + padding=0, + return_indices=False, + ceil_mode=False, + name=None): + """ + + Applies a 1D max pooling over an input signal composed of several input planes based + on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + + The output value of the layer with input size (N, C, L), + output (N, C, L_{out}) and kernel_size k can be precisely described as + For average pool1d: + + .. math:: + + Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} + + Args: + x (Tensor): The input tensor of pooling operator which is a 3-D tensor with + shape [N, C, L], where `N` is batch size, `C` is the number of channels, + `L` is the length of the feature. The data type if float32 or float64. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one integers. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain one integers. + padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be the following forms: `[pad_left, pad_right]`. + return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. + If it is set to False, the floor function will be used. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of pooling result. The data type is same as input tensor. + + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ValueError: If `padding` is a list or tuple but its length greater than 1. + ShapeError: If the input is not a 3-D. + ShapeError: If the output's shape calculated is not greater than 0. + + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) + # pool_out shape: [1, 3, 16] + + pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + + """ + """NCL to NCHW""" + data_format = "NCHW" + check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'max_pool1d') + check_input(x, 3) + x = unsqueeze(x, [2]) + kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride') + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(padding)) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0] + if ceil_mode != False: + raise ValueError( + "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " + "Received ceil_mode: True.") + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0] + + padding = update_padding1d(padding, 'max') + + if in_dygraph_mode(): + pool_out = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, + 'paddings', padding, 'padding_algorithm', padding_algorithm, + 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, + 'exclusive', True, 'data_format', data_format) + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + + op_type = 'max_pool2d_with_index' + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": True, + "data_format": data_format, + }) + + return (squeeze(pool_out, [2]), + squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + + +def adaptive_avg_pool1d(x, output_size, name=None): + """ + + This operation applies a 1D adaptive average pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For average adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} + + Args: + x (Tensor): The input tensor of pooling operator, which is a 3-D tensor + with shape [N, C, L]. The format of input tensor is NCL, + where N is batch size, C is the number of channels, L is the + length of the feature. The data type is float32 or float64. + output_size (int|list|tuple): The pool kernel size. 
If pool kernel size is a tuple or list, + it must contain one int. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of adaptive average pooling result. The data type is same + as input tensor. + + Raises: + ValueError: 'output_size' should be a integer or list or tuple with length as 1. + + Examples: + .. code-block:: python + + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) + # + import paddle + import paddle.nn.functional as F + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.adaptive_average_pool1d(data, output_size=16) + # pool_out shape: [1, 3, 16]) + """ + pool_type = 'avg' + check_variable_and_dtype(x, 'input', ['float32', 'float64'], + 'adaptive_pool2d') + check_input(x, 3) + check_type(output_size, 'pool_size', (int), 'adaptive_pool1d') + + pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + + l_type = "pool2d" + x = unsqueeze(x, [2]) + if in_dygraph_mode(): + pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize', + pool_size, 'adaptive', True) + return squeeze(pool_out, [2]) + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }) + + return squeeze(pool_out, [2]) + + +def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): + """ + This operation applies a 1D adaptive max pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For max adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= max(Input[lstart:lend])} + + Args: + x (Tensor): The input tensor of pooling operator, which is a 3-D tensor + with shape [N, C, L]. The format of input tensor is NCL, + where N is batch size, C is the number of channels, L is the + length of the feature. The data type is float32 or float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_indices (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of adaptive pooling result. The data type is same + as input tensor. + + Raises: + ValueError: 'output_size' should be a integer or list or tuple with length as 1. 
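The lstart/lend formulas shared by the two adaptive 1D pools translate directly into NumPy. A minimal sketch of the average variant (shape and values illustrative only); the max variant simply replaces the mean with a max, plus an argmax when return_indices=True:

.. code-block:: python

    # Adaptive average pooling over the last (L) dimension:
    # lstart = floor(i * L / m), lend = ceil((i + 1) * L / m), then average the slice.
    import math
    import numpy as np

    x = np.random.uniform(-1, 1, (1, 3, 32)).astype("float32")  # [N, C, L]
    m = 16                                                       # output_size
    L = x.shape[-1]

    out = np.empty(x.shape[:-1] + (m,), dtype=x.dtype)
    for i in range(m):
        lstart = math.floor(i * L / m)
        lend = math.ceil((i + 1) * L / m)
        out[..., i] = x[..., lstart:lend].mean(axis=-1)
    # out.shape == (1, 3, 16), matching the adaptive_avg_pool1d example above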
+ + Examples: + .. code-block:: python + + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn.functional as F + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.adaptive_max_pool1d(data, output_size=16) + # pool_out shape: [1, 3, 16]) + + pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True) + # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] + + """ + pool_type = 'max' + check_variable_and_dtype(x, 'input', ['float32', 'float64'], + 'adaptive_max_pool1d') + check_input(x, 3) + check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') + check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') + + pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + + l_type = 'max_pool2d_with_index' + + x = unsqueeze(x, [2]) + if in_dygraph_mode(): + pool_out = core.ops.max_pool2d_with_index( + x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }) + + return (squeeze(pool_out, [2]), + squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + + +def max_pool2d(x, + kernel_size, + stride=None, + padding=0, + return_indices=False, + ceil_mode=False, + data_format="NCHW", + name=None): + """ + This operation applies 2D max pooling over input feature based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCHW format, where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. + + Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Attr: + kernel_size: ksize + stride: stride + + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + $$ + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) + $$ + + Args: + x (Tensor): The input tensor of pooling operator which is a 4-D tensor with + shape [N, C, H, W]. The format of input tensor is `"NCHW"` or + `"NHWC"`, where `N` is batch size, `C` is the number of channels, + `H` is the height of the feature, and `W` is the width of the + feature. The data type if float32 or float64. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. + stride (int|list|tuple): The pool stride size. 
If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be in three forms: `[pad_height, pad_width]` or + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, + `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Otherwise, the pool padding size will be a square of an int. + ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape + return_indices (bool): Whether to return the max indices along with the outputs. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of pooling result. The data type is same as input tensor. + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. code-block:: python + import paddle + import paddle.nn.functional as F + import numpy as np + paddle.disable_static() + + # max pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + output = F.max_pool2d(input, + kernel_size=2, + stride=2, padding=0) + # output.shape [1, 3, 16, 16] + + # for return_indices=True + output, max_indices = F.max_pool2d(input, + kernel_size=2, + stride=2, + padding=0, + return_indices=True) + # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + """ + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d') + kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 2, 'pool_stride') + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s." % str(data_format)) + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(padding)) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. 
" + "Received ceil_mode: True.") + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0, 0] + + padding = update_padding2d(padding, data_format) + + if in_dygraph_mode(): + output = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, + 'paddings', padding, 'padding_algorithm', padding_algorithm, + 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, + 'exclusive', True, 'data_format', data_format) + return output if return_indices else output[0] + + op_type = 'max_pool2d_with_index' + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": True, + "data_format": data_format, + }) + + return (pool_out, mask) if return_indices else pool_out + + +def avg_pool2d(x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCHW", + name=None): + """ + This operation applies 2D average pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCHW format, where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. + + Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Attr: + kernel_size: ksize + + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + $$ + out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + $$ + + Args: + x (Tensor): The input tensor of pooling operator which is a 4-D tensor with + shape [N, C, H, W]. The format of input tensor is `"NCHW"` or + `"NHWC"`, where `N` is batch size, `C` is the number of channels, + `H` is the height of the feature, and `W` is the width of the + feature. The data type if float32 or float64. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be in three forms: `[pad_height, pad_width]` or + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, + `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Otherwise, the pool padding size will be a square of an int. 
+ ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. + divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of pooling result. The data type is same as input tensor. + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. code-block:: python + import paddle + import paddle.nn.functional as F + import numpy as np + paddle.disable_static() + + # avg pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + output = F.avg_pool2d(input, + kernel_size=2, + stride=2, padding=0) + # output.shape [1, 3, 16, 16] + + """ + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d') + kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 2, 'pool_stride') + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(padding)) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. " + "Received ceil_mode: True.") + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0, 0] + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s." 
% str(data_format)) + pool_padding = update_padding2d(padding, data_format) + + if in_dygraph_mode(): + output = core.ops.pool2d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', + False, 'padding_algorithm', padding_algorithm, 'strides', stride, + 'paddings', pool_padding, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) + if divisor_override is None: + return output + else: + check_instance(divisor_override, "divisor_override") + return output * (kernel_size[0] * kernel_size[1]) / divisor_override + + op_type = 'pool2d' + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": "avg", + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": pool_padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": not count_include_pad, + "data_format": data_format, + }) + + if divisor_override is None: + return pool_out + else: + check_instance(divisor_override, "divisor_override") + return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override + + +def max_pool3d(x, + kernel_size, + stride=None, + padding=0, + return_indices=False, + ceil_mode=False, + data_format="NCDHW", + name=None): + """ + This operation applies 3D max pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. + + Example: + Input: + X shape: $(N, C, D_{in}, H_{in}, W_{in})$ + Attr: + kernel_size: ksize + + Output: + Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + $$ + \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, ksize[0]-1} \max_{m=0, \ldots, ksize[1]-1} \max_{n=0, \ldots, ksize[2]-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times d + k, + \text{stride[1]} \times h + m, \text{stride[2]} \times w + n) + $$ + + Args: + x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with + shape [N, C, D, H, W]. The format of + input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is + the number of channels, `D` is the depth of the feature, + `H` is the height of the feature, and `W` is the width + of the feature. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (pool_size_Depth, pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, + it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. + Otherwise, the pool stride size will be a cube of an int. + padding (int|list|tuple): The pool padding size. 
If pool padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + ceil_mode (bool): ${ceil_mode_comment} + return_indices (bool): Whether to return the max indices along with the outputs. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of pooling result. The data type is same as input tensor. + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. code-block:: python + import paddle + import paddle.nn.functional as F + import numpy as np + paddle.disable_static() + + # max pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + output = F.max_pool2d(input, + kernel_size=2, + stride=2, padding=0) + output.shape [1, 3, 16, 16, 16] + + # for return_indices=True + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + output, max_indices = paddle.nn.functional.max_pool3d(input, + kernel_size = 2, + stride = 2, + padding=0, + return_indices=True) + # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16], + + """ + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') + kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 3, 'pool_stride') + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(padding)) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " + "Received ceil_mode: True.") + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0, 0, 0] + + if data_format not in ["NCDHW", "NDHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. 
Received " + "Attr(data_format): %s" % str(data_format)) + padding = update_padding3d(padding, data_format) + + if in_dygraph_mode(): + output = core.ops.max_pool3d_with_index( + x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, + 'paddings', padding, 'global_pooling', False, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) + return output if return_indices else output[0] + + op_type = "max_pool3d_with_index" + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": False, + "data_format": data_format, + }) + + return (pool_out, mask) if return_indices else pool_out + + +def avg_pool3d(x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=False, + divisor_override=None, + data_format="NCDHW", + name=None): + """ + This operation applies 3D max pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. + + Args: + input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with + shape [N, C, D, H, W], where `N` is batch size, `C` is + the number of channels, `D` is the depth of the feature, + `H` is the height of the feature, and `W` is the width + of the feature. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (pool_size_Depth, pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, + it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. + Otherwise, the pool stride size will be a cube of an int. + padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + ceil_mode (bool): ${ceil_mode_comment} + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is True. + divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. 
+ data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + + Returns: + Tensor: The output tensor of pooling result. The data type is same as input tensor. + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. code-block:: python + import paddle.fluid as fluid + import paddle + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + # avg pool3d + pool3d = paddle.nn.functional.avg_pool3d( + input, + kernel_size = 2, + stride = 2, + padding=0) + # pool3d.shape: [1, 3, 16, 16, 16] + """ + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') + kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 3, 'pool_stride') + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(padding)) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " + "Received ceil_mode: True.") + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0, 0, 0] + + if data_format not in ["NCDHW", "NDHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. 
Received " + "Attr(data_format): %s" % str(data_format)) + padding = update_padding3d(padding, data_format) + + if in_dygraph_mode(): + output = core.ops.pool3d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, + 'paddings', padding, 'global_pooling', False, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) + if divisor_override is None: + return output + else: + check_instance(divisor_override, "divisor_override") + return output * (kernel_size[0] * kernel_size[1] * + kernel_size[2]) / divisor_override + + op_type = "pool3d" + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out} + + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'avg', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": not count_include_pad, + "data_format": data_format, + }) + + if divisor_override is None: + return pool_out + else: + check_instance(divisor_override, "divisor_override") + return pool_out * (kernel_size[0] * kernel_size[1] * + kernel_size[2]) / divisor_override + + +def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): + """ + + This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. + See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool2d` . + + For avg adaptive pool2d: + + .. math:: + + hstart &= floor(i * H_{in} / H_{out}) + + hend &= ceil((i + 1) * H_{in} / H_{out}) + + wstart &= floor(j * W_{in} / W_{out}) + + wend &= ceil((j + 1) * W_{in} / W_{out}) + + Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + + Args: + x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. + The data type can be float32 or float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two element, (H, W). H and W can be either a int, or None which means + the size will be the same as that of the input. + data_format (str): The data format of the input and output data. An optional string + from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in + the order of: [batch_size, input_channels, input_height, input_width]. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. + + Raises: + ValueError: If `data_format` is not "NCHW" or "NHWC". + + Examples: + .. code-block:: python + + # adaptive avg pool2d + # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m * n grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive avg pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 32, 32] + pool_out = paddle.nn.functional.adaptive_avg_pool2d( + x = x, + output_size=[3, 3]) + # pool_out.shape is [2, 3, 3, 3] + """ + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_avg_pool2d') + check_type(data_format, 'data_format', str, 'adaptive_avg_pool2d') + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s." % str(data_format)) + + if data_format == "NCHW": + in_h, in_w = x.shape[2:4] + else: + in_h, in_w = x.shape[1:3] + + if isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 2, 'output_size') + else: + if output_size[0] == None: + output_size[0] = in_h + if output_size[1] == None: + output_size[1] = in_w + + if in_dygraph_mode(): + output = core.ops.pool2d(x, 'pooling_type', 'avg', 'ksize', output_size, + 'global_pooling', False, 'adaptive', True, + 'data_format', data_format) + return output + + l_type = 'pool2d' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": "avg", + "ksize": output_size, + "adaptive": True, + "data_format": data_format, + }) + + return pool_out + + +def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): + """ + + This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. + See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool3d` . + + For avg adaptive pool3d: + + .. math:: + + dstart &= floor(i * D_{in} / D_{out}) + + dend &= ceil((i + 1) * D_{in} / D_{out}) + + hstart &= floor(j * H_{in} / H_{out}) + + hend &= ceil((j + 1) * H_{in} / H_{out}) + + wstart &= floor(k * W_{in} / W_{out}) + + wend &= ceil((k + 1) * W_{in} / W_{out}) + + Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + + Args: + x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means + the size will be the same as that of the input. + data_format (str): The data format of the input and output data. An optional string + from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in + the order of: [batch_size, input_channels, input_depth, input_height, input_width]. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. 
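The ``None`` convention for ``output_size`` (used here and in ``adaptive_avg_pool2d`` above) keeps the corresponding input dimension unchanged; a short sketch of the expected behaviour, with an illustrative input shape:

.. code-block:: python

    import paddle
    import numpy as np

    paddle.disable_static()
    x = paddle.to_tensor(np.random.rand(2, 3, 32, 64).astype("float32"))

    # A None entry is replaced by the matching input size (here W = 64).
    out = paddle.nn.functional.adaptive_avg_pool2d(x, output_size=[3, None])
    # out.shape is expected to be [2, 3, 3, 64]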
+ + Raises: + ValueError: If `data_format` is not "NCDHW" or "NDHWC". + + Examples: + .. code-block:: python + + # adaptive avg pool3d + # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into l * m * n grids averagely and performs poolings in each + # grid to get output. + # adaptive avg pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 8, 32, 32] + pool_out = paddle.nn.functional.adaptive_avg_pool3d( + x = x, + output_size=[3, 3, 3]) + # pool_out.shape is [2, 3, 3, 3, 3] + """ + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_avg_pool3d') + check_type(data_format, 'data_format', str, 'adaptive_avg_pool3d') + + if data_format not in ["NCDHW", "NDHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " + "Attr(data_format): %s." % str(data_format)) + + if data_format == "NCDHW": + in_l, in_h, in_w = x.shape[2:5] + else: + in_l, in_h, in_w = x.shape[1:4] + + if isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 3, 'output_size') + else: + if output_size[0] == None: + output_size[0] = in_l + if output_size[1] == None: + output_size[1] = in_h + if output_size[2] == None: + output_size[2] = in_w + + if in_dygraph_mode(): + output = core.ops.pool3d(x, 'pooling_type', 'avg', 'ksize', output_size, + 'global_pooling', False, 'adaptive', True, + 'data_format', data_format) + return output + + l_type = 'pool3d' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": "avg", + "ksize": output_size, + "adaptive": True, + "data_format": data_format, + }) + + return pool_out diff --git a/python/paddle/nn/functional/rnn.py b/python/paddle/nn/functional/rnn.py index 520cf44360dc370052375c2c9ef3d0b00fbc05de..b7a97bc5aa303ca507cac37798a3625d498050e3 100644 --- a/python/paddle/nn/functional/rnn.py +++ b/python/paddle/nn/functional/rnn.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define function of recurrent neural network +from paddle.fluid.layers.rnn import rnn, birnn -__all__ = [ - # 'gru_unit', - # 'lstm', - # 'lstm_unit' -] +__all__ = ['rnn', 'birnn'] diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index a2cc8fde5ad7147b7af4765de834508f1f3cc825..1dfdac26e990851ac5f192742acd47fb92633d0d 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -12,9 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
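One quick check of the ``rnn``/``birnn`` re-export introduced above. This is hedged: whether ``paddle.nn.functional`` actually surfaces the module contents depends on package ``__init__`` wiring that is not shown in this hunk, so the sketch only compares objects if the attribute exists.

.. code-block:: python

    import paddle
    from paddle.fluid.layers.rnn import rnn as fluid_rnn

    # Assuming paddle.nn.functional re-exports its rnn module (not shown in
    # this hunk), the new name should resolve to the same callable.
    print(getattr(paddle.nn.functional, "rnn", None) is fluid_rnn)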
+from ...device import get_cudnn_version +from ...fluid.framework import core, in_dygraph_mode, Variable +from ...fluid.layer_helper import LayerHelper +from ...fluid.data_feeder import check_variable_and_dtype +from ...fluid import dygraph_utils +import numpy as np + # TODO: define specitial functions used in computer vision task from ...fluid.layers import affine_channel #DEFINE_ALIAS -from ...fluid.layers import affine_grid #DEFINE_ALIAS from ...fluid.layers import anchor_generator #DEFINE_ALIAS from ...fluid.layers import bipartite_match #DEFINE_ALIAS from ...fluid.layers import box_clip #DEFINE_ALIAS @@ -44,7 +50,7 @@ from ...fluid.layers import yolov3_loss #DEFINE_ALIAS from ...fluid.layers import fsp_matrix #DEFINE_ALIAS from ...fluid.layers import image_resize_short #DEFINE_ALIAS -from ...fluid.layers import pixel_shuffle #DEFINE_ALIAS +# from ...fluid.layers import pixel_shuffle #DEFINE_ALIAS from ...fluid.layers import retinanet_detection_output #DEFINE_ALIAS from ...fluid.layers import retinanet_target_assign #DEFINE_ALIAS from ...fluid.layers import roi_perspective_transform #DEFINE_ALIAS @@ -89,3 +95,313 @@ __all__ = [ 'yolo_box', 'yolov3_loss' ] + + +def affine_grid(theta, out_shape, align_corners=True, name=None): + """ + It generates a grid of (x,y) coordinates using the parameters of + the affine transformation that correspond to a set of points where + the input feature map should be sampled to produce the transformed + output feature map. + + Args: + theta (Tensor) - A tensor with shape [N, 2, 3]. It contains a batch of affine transform parameters. + The data type can be float32 or float64. + out_shape (Tensor | list | tuple): The shape of target output with format [batch_size, channel, height, width]. + ``out_shape`` can be a Tensor or a list or tuple. The data + type must be int32. + align_corners(bool): Whether to align corners of target feature map and source feature map. Default: True. + name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, A Tensor with shape [batch_size, H, W, 2] while 'H' and 'W' are the height and width of feature map in affine transformation. The data type is the same as `theta`. + + Raises: + ValueError: If the type of arguments is not supported. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + paddle.disable_static() + # theta shape = [1, 2, 3] + theta = np.array([[[-0.7, -0.4, 0.3], + [ 0.6, 0.5, 1.5]]]).astype("float32") + theta_t = paddle.to_tensor(theta) + y_t = F.affine_grid( + theta_t, + [1, 2, 3, 3], + align_corners=False) + print(y_t.numpy()) + + #[[[[ 1.0333333 0.76666665] + # [ 0.76666665 1.0999999 ] + # [ 0.5 1.4333333 ]] + # + # [[ 0.5666667 1.1666666 ] + # [ 0.3 1.5 ] + # [ 0.03333333 1.8333334 ]] + # + # [[ 0.10000002 1.5666667 ] + # [-0.16666666 1.9000001 ] + # [-0.43333334 2.2333333 ]]]] + """ + helper = LayerHelper('affine_grid') + + if not isinstance(theta, Variable): + raise ValueError("The theta should be a Tensor.") + check_variable_and_dtype(theta, 'theta', ['float32', 'float64'], + 'affine_grid') + cudnn_version = get_cudnn_version() + if cudnn_version is not None and cudnn_version >= 6000 and align_corners: + use_cudnn = True + else: + use_cudnn = False + + if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ + isinstance(out_shape, Variable)): + raise ValueError("The out_shape should be a list, tuple or Tensor.") + + if in_dygraph_mode(): + _out_shape = out_shape.numpy().tolist() if isinstance( + out_shape, Variable) else out_shape + return core.ops.affine_grid(theta, "output_shape", _out_shape, + "align_corners", align_corners, "use_cudnn", + use_cudnn) + + out = helper.create_variable_for_type_inference(theta.dtype) + ipts = {'Theta': theta} + attrs = {"align_corners": align_corners, "use_cudnn": use_cudnn} + if isinstance(out_shape, Variable): + ipts['OutputShape'] = out_shape + check_variable_and_dtype(out_shape, 'out_shape', ['int32'], + 'affine_grid') + else: + attrs['output_shape'] = out_shape + + helper.append_op( + type='affine_grid', + inputs=ipts, + outputs={'Output': out}, + attrs=None if len(attrs) == 0 else attrs) + return out + + +def grid_sample(x, + grid, + mode='bilinear', + padding_mode='zeros', + align_corners=True, + name=None): + """ + This operation samples input X by using bilinear interpolation or + nearest interpolation based on flow field grid, which is usually + generated by :code:`affine_grid` . The grid of shape [N, H, W, 2] + is the concatenation of (x, y) coordinates with shape [N, H, W] each, + where x is indexing the 4th dimension (in width dimension) of input + data x and y is indexing the 3rd dimension (in height dimension), + finally results is the bilinear interpolation or nearest value of 4 nearest corner + points. The output tensor shape will be [N, C, H, W]. + .. code-block:: text + Step 1: + Get (x, y) grid coordinates and scale to [0, H-1/W-1]. + .. code-block:: text + grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) + grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) + Step 2: + Indices input data X with grid (x, y) in each [H, W] area, and bilinear + interpolate point value by 4 nearest points or nearest interpolate point value + by nearest point. 
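The bilinear case described in Step 1 and Step 2 can be sketched for a single sampling location as follows. The input shape and the sample point are made up, border handling for out-of-range indices is omitted, and ``align_corners=True`` scaling is assumed.

.. code-block:: python

    import numpy as np

    x = np.arange(16, dtype=np.float64).reshape(1, 1, 4, 4)   # [N, C, H, W]
    N, C, H, W = x.shape
    gx, gy = 0.25, -0.5                     # one (x, y) grid entry in [-1, 1]

    # Step 1: scale normalized coordinates to pixel space.
    grid_x = 0.5 * (gx + 1) * (W - 1)
    grid_y = 0.5 * (gy + 1) * (H - 1)

    # Step 2: bilinear interpolation from the four nearest corners.
    x_w, y_n = int(np.floor(grid_x)), int(np.floor(grid_y))
    x_e, y_s = x_w + 1, y_n + 1
    d_w, d_e = grid_x - x_w, x_e - grid_x
    d_n, d_s = grid_y - y_n, y_s - grid_y
    wn = x[:, :, y_n, x_w]                  # north-west corner value
    en = x[:, :, y_n, x_e]                  # north-east corner value
    ws = x[:, :, y_s, x_w]                  # south-west corner value
    es = x[:, :, y_s, x_e]                  # south-east corner value
    out = wn * d_e * d_s + en * d_w * d_s + ws * d_e * d_n + es * d_w * d_n
    print(out)                              # [[4.875]]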
+ wn ------- y_n ------- en + | | | + | d_n | + | | | + x_w --d_w-- grid--d_e-- x_e + | | | + | d_s | + | | | + ws ------- y_s ------- wn + For bilinear interpolation: + x_w = floor(x) // west side x coord + x_e = x_w + 1 // east side x coord + y_n = floor(y) // north side y coord + y_s = y_s + 1 // south side y coord + d_w = grid_x - x_w // distance to west side + d_e = x_e - grid_x // distance to east side + d_n = grid_y - y_n // distance to north side + d_s = y_s - grid_y // distance to south side + wn = X[:, :, y_n, x_w] // north-west point value + en = X[:, :, y_n, x_e] // north-east point value + ws = X[:, :, y_s, x_w] // south-east point value + es = X[:, :, y_s, x_w] // north-east point value + output = wn * d_e * d_s + en * d_w * d_s + + ws * d_e * d_n + es * d_w * d_n + Args: + x(Tensor): The input tensor, which is a 4-d tensor with shape + [N, C, H, W], N is the batch size, C is the channel + number, H and W is the feature height and width. + The data type is float32 or float64. + grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The + data type is float32 or float64. + mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'. + Default: 'bilinear'. + padding_mode(str, optional) The padding method used when source index + is out of input images. It can be 'zeros', 'reflect' and 'border'. + Default: zeros. + align_corners(bool, optional): If `align_corners` is true, it will projects + -1 and 1 to the centers of the corner pixels. Otherwise, it will + projects -1 and 1 to the image edges. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor. + + Examples: + .. code-block:: python + import paddle + import paddle.nn.functional as F + import numpy as np + + # shape=[1, 1, 3, 3] + x = np.array([[[[-0.6, 0.8, -0.5], + [-0.5, 0.2, 1.2], + [ 1.4, 0.3, -0.2]]]]).astype("float64") + + # grid shape = [1, 3, 4, 2] + grid = np.array( + [[[[ 0.2, 0.3], + [-0.4, -0.3], + [-0.9, 0.3], + [-0.9, -0.6]], + [[ 0.4, 0.1], + [ 0.9, -0.8], + [ 0.4, 0.5], + [ 0.5, -0.2]], + [[ 0.1, -0.8], + [-0.3, -1. ], + [ 0.7, 0.4], + [ 0.2, 0.8]]]]).astype("float64") + + paddle.disable_static() + x = paddle.to_tensor(x) + grid = paddle.to_tensor(grid) + y_t = F.grid_sample( + x, + grid, + mode='bilinear', + padding_mode='border', + align_corners=True) + print(y_t.numpy()) + + # output shape = [1, 1, 3, 4] + # [[[[ 0.34 0.016 0.086 -0.448] + # [ 0.55 -0.076 0.35 0.59 ] + # [ 0.596 0.38 0.52 0.24 ]]]] + """ + helper = LayerHelper("grid_sample", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler') + check_variable_and_dtype(grid, 'grid', ['float32', 'float64'], + 'grid_sampler') + if not isinstance(x, Variable): + raise ValueError("The x should be a Variable") + if not isinstance(grid, Variable): + raise ValueError("The grid should be a Variable") + _modes = ['bilinear', 'nearest'] + _padding_modes = ['zeros', 'reflect', 'border'] + if mode not in _modes: + raise ValueError( + "The mode of grid sample function should be in {}, but got: {}". + format(_modes, mode)) + if padding_mode not in _padding_modes: + raise ValueError( + "The padding mode of grid sample function should be in {}, but got: {}". 
+ format(_padding_modes, padding_mode)) + + if not isinstance(align_corners, bool): + raise ValueError("The align corners should be bool, but got: {}".format( + align_corners)) + + cudnn_version = get_cudnn_version() + use_cudnn = False + if (cudnn_version is not None + ) and align_corners and mode == 'bilinear' and padding_mode == 'zeros': + use_cudnn = True + ipts = {'X': x, 'Grid': grid} + attrs = { + 'mode': mode, + 'padding_mode': padding_mode, + 'align_corners': align_corners, + 'use_cudnn': use_cudnn + } + + if in_dygraph_mode(): + attrs = ('mode', mode, 'padding_mode', padding_mode, 'align_corners', + align_corners, 'use_cudnn', use_cudnn) + out = getattr(core.ops, 'grid_sampler')(x, grid, *attrs) + else: + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='grid_sampler', + inputs=ipts, + attrs=attrs, + outputs={'Output': out}) + return out + + +def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None): + """ + This API implements pixel shuffle operation. + See more details in :ref:`api_nn_vision_PixelShuffle` . + Parameters: + x(Tensor): 4-D tensor, the data type should be float32 or float64. + upscale_factor(int): factor to increase spatial resolution. + data_format (str): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width]. + name (str, optional): The default value is None. Normally there is no need for user to set this property. + Returns: + Out(tensor): Reshaped tensor according to the new dimension. + Raises: + ValueError: If the square of upscale_factor cannot divide the channels of input. + Examples: + .. code-block:: python + import paddle + import paddle.nn.functional as F + import numpy as np + x = np.random.randn(2, 9, 4, 4).astype(np.float32) + paddle.disable_static() + x_var = paddle.to_tensor(x) + out_var = F.pixel_shuffle(x_var, 3) + out = out_var.numpy() + print(out.shape) + # (2, 1, 12, 12) + """ + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'pixel_shuffle') + + if not isinstance(upscale_factor, int): + raise TypeError("upscale factor must be int type") + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'." + "But recevie Attr(data_format): {} ".format( + data_format)) + + if in_dygraph_mode(): + return core.ops.pixel_shuffle(x, "upscale_factor", upscale_factor, + "data_format", data_format) + + helper = LayerHelper("pixel_shuffle", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="pixel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"upscale_factor": upscale_factor, + "data_format": data_format}) + return out diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index f64252da5428a09a5530eb1d2cec375c9141ea9a..3399e4e34c9e3bc61fde515fc1917deb213f3d0b 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -20,7 +20,10 @@ from . import conv from . import extension from . import activation from . import norm +from . import rnn +from . import vision from . import distance +from . 
import transformer from .activation import * from .loss import * @@ -28,6 +31,10 @@ from .conv import * from .extension import * from .activation import * from .norm import * +from .rnn import * +from .vision import * + +from .transformer import * # from .activation import PReLU #DEFINE_ALIAS from .activation import ReLU #DEFINE_ALIAS from .activation import LeakyReLU #DEFINE_ALIAS @@ -36,16 +43,45 @@ from .activation import Sigmoid #DEFINE_ALIAS from .activation import LogSoftmax #DEFINE_ALIAS from .activation import HSigmoid #DEFINE_ALIAS from .common import BilinearTensorProduct #DEFINE_ALIAS +from .common import Bilinear #DEFINE_ALIAS from .common import Pool2D #DEFINE_ALIAS from .common import Pad2D #DEFINE_ALIAS +from .common import ReflectionPad1d #DEFINE_ALIAS +from .common import ReplicationPad1d #DEFINE_ALIAS +from .common import ConstantPad1d #DEFINE_ALIAS +from .common import ReflectionPad2d #DEFINE_ALIAS +from .common import ReplicationPad2d #DEFINE_ALIAS +from .common import ConstantPad2d #DEFINE_ALIAS +from .common import ZeroPad2d #DEFINE_ALIAS +from .common import ReplicationPad3d #DEFINE_ALIAS +from .common import ConstantPad3d #DEFINE_ALIAS +from .common import CosineSimilarity #DEFINE_ALIAS from .common import Embedding #DEFINE_ALIAS from .common import Linear #DEFINE_ALIAS from .common import Flatten #DEFINE_ALIAS from .common import UpSample #DEFINE_ALIAS -from .conv import Conv2D #DEFINE_ALIAS -from .conv import Conv2DTranspose #DEFINE_ALIAS -from .conv import Conv3D #DEFINE_ALIAS -from .conv import Conv3DTranspose #DEFINE_ALIAS +from .common import UpsamplingNearest2d #DEFINE_ALIAS +from .common import UpsamplingBilinear2d #DEFINE_ALIAS +from .common import Dropout #DEFINE_ALIAS +from .common import Dropout2D #DEFINE_ALIAS +from .common import Dropout3D #DEFINE_ALIAS +from .common import AlphaDropout #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS +from .pooling import AvgPool1d #DEFINE_ALIAS +from .pooling import MaxPool1d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS +from .pooling import AvgPool2d #DEFINE_ALIAS +from .pooling import MaxPool2d #DEFINE_ALIAS +from .pooling import AvgPool3d #DEFINE_ALIAS +from .pooling import MaxPool3d #DEFINE_ALIAS +from .conv import Conv1d #DEFINE_ALIAS +from .conv import Conv2d #DEFINE_ALIAS +from .conv import Conv3d #DEFINE_ALIAS +from .conv import ConvTranspose1d #DEFINE_ALIAS +from .conv import ConvTranspose2d #DEFINE_ALIAS +from .conv import ConvTranspose3d #DEFINE_ALIAS # from .conv import TreeConv #DEFINE_ALIAS # from .conv import Conv1D #DEFINE_ALIAS from .extension import RowConv #DEFINE_ALIAS @@ -57,6 +93,7 @@ from .extension import RowConv #DEFINE_ALIAS # from .learning_rate import PiecewiseDecay #DEFINE_ALIAS # from .learning_rate import PolynomialDecay #DEFINE_ALIAS # from .loss import NCELoss #DEFINE_ALIAS +from .loss import BCEWithLogitsLoss #DEFINE_ALIAS from .loss import CrossEntropyLoss #DEFINE_ALIAS from .loss import MSELoss #DEFINE_ALIAS from .loss import L1Loss #DEFINE_ALIAS @@ -64,6 +101,8 @@ from .loss import NLLLoss #DEFINE_ALIAS from .loss import BCELoss #DEFINE_ALIAS from .loss import KLDivLoss #DEFINE_ALIAS from .loss import MarginRankingLoss #DEFINE_ALIAS +from .loss import CTCLoss #DEFINE_ALIAS +from .loss import SmoothL1Loss #DEFINE_ALIAS from .norm import BatchNorm #DEFINE_ALIAS from .norm import SyncBatchNorm #DEFINE_ALIAS from .norm import GroupNorm 
#DEFINE_ALIAS @@ -73,4 +112,6 @@ from .norm import InstanceNorm #DEFINE_ALIAS # from .rnn import RNNCell #DEFINE_ALIAS # from .rnn import GRUCell #DEFINE_ALIAS # from .rnn import LSTMCell #DEFINE_ALIAS + +from .vision import PixelShuffle #DEFINE_ALIAS from .distance import PairwiseDistance #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 6b965564fc9c344183f715669335e018e80f6da4..c38d6018a2500111280a482aa60d072e65e27742 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -18,19 +18,30 @@ __all__ = [ 'ELU', 'GELU', 'Hardshrink', - # 'PReLU', + 'Tanh', + 'Hardtanh', + 'PReLU', 'ReLU', + 'ReLU6', + 'SELU', 'LeakyReLU', 'Sigmoid', - # 'Softmax', + 'Softmax', + 'Softplus', + 'Softshrink', + 'Softsign', + 'Tanhshrink', 'LogSigmoid', 'LogSoftmax', - 'HSigmoid' + 'HSigmoid', ] from ...fluid.dygraph import layers from ...fluid import core from ...fluid.framework import in_dygraph_mode +from ...fluid.param_attr import ParamAttr +from ...fluid.initializer import Constant +from paddle.framework import get_default_dtype from .. import functional as F @@ -38,7 +49,7 @@ class ELU(layers.Layer): """ ELU Activation. - .. math:: + .. math:: ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) @@ -54,16 +65,16 @@ class ELU(layers.Layer): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) - m = paddle.nn.ELU(0.2) - out = m(x) - # [[-0.12642411 6. ] - # [ 1. 15.6 ]] + x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) + m = paddle.nn.ELU(0.2) + out = m(x) + # [[-0.12642411 6. ] + # [ 1. 15.6 ]] """ def __init__(self, alpha=1.0, name=None): @@ -81,13 +92,13 @@ class GELU(layers.Layer): If approximate is True - .. math:: + .. math:: GELU(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) else - .. math:: + .. math:: GELU(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) @@ -103,23 +114,18 @@ class GELU(layers.Layer): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - data = np.random.randn(2, 3).astype("float32") - x = paddle.to_tensor(data) + x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) + + m = paddle.nn.GELU() + out = m(x) # [-0.158655 0.345731 0.841345 1.39979] - m = paddle.nn.GELU() - out = m(x) - - data - # array([[ 0.87165993, -1.0541513 , -0.37214822], - # [ 0.15647964, 0.32496083, 0.33045998]], dtype=float32) - out - # array([[ 0.70456535, -0.15380788, -0.13207214], - # [ 0.08796856, 0.20387867, 0.2080159 ]], dtype=float32) + m = paddle.nn.GELU(True) + out = m(x) # [-0.158808 0.345714 0.841192 1.39957] """ def __init__(self, approximate=False, name=None): @@ -138,13 +144,13 @@ class Hardshrink(layers.Layer): .. math:: hardshrink(x)= - \left\{ - \begin{aligned} - &x, & & if \ x > threshold \\ - &x, & & if \ x < -threshold \\ - &0, & & if \ others - \end{aligned} - \right. + \\left\\{ + \\begin{aligned} + &x, & & if \\ x > threshold \\\\ + &x, & & if \\ x < -threshold \\\\ + &0, & & if \\ others + \\end{aligned} + \\right. Parameters: threshold (float, optional): The value of threshold for hardthrink. Default is 0.5 @@ -159,14 +165,14 @@ class Hardshrink(layers.Layer): .. 
code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_variable(np.array([-1, 0.3, 2.5])) - m = paddle.nn.Hardshrink() - out = m(x) # [-1., 0., 2.5] + x = paddle.to_tensor(np.array([-1, 0.3, 2.5])) + m = paddle.nn.Hardshrink() + out = m(x) # [-1., 0., 2.5] """ def __init__(self, threshold=0.5, name=None): @@ -178,6 +184,90 @@ class Hardshrink(layers.Layer): return F.hardshrink(x, self._threshold, self._name) +class Tanh(layers.Layer): + """ + Tanh Activation. + + .. math:: + Tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + m = paddle.nn.Tanh() + out = m(x) + print(out.numpy()) + # [-0.37994896 -0.19737532 0.09966799 0.29131261] + """ + + def __init__(self, name=None): + super(Tanh, self).__init__() + self._name = name + + def forward(self, x): + return F.tanh(x, self._name) + + +class Hardtanh(layers.Layer): + """ + Hardtanh Activation + + .. math:: + + Hardtanh(x)= \\begin{cases} + max, \\text{if } x > max \\\\ + min, \\text{if } x < min \\\\ + x, \\text{otherwise} + \\end{cases} + + Parameters: + min (float, optional): The value of min for Hardtanh. Default is -1. + max (float, optional): The value of max for Hardtanh. Default is 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5])) + m = paddle.nn.Hardtanh() + out = m(x) # # [-1., 0.3, 1.] + """ + + def __init__(self, min=-1.0, max=1.0, name=None): + super(Hardtanh, self).__init__() + self._min = min + self._max = max + self._name = name + + def forward(self, x): + return F.hardtanh(x, self._min, self._max, self._name) + + class HSigmoid(layers.Layer): """ :alias_main: paddle.nn.HSigmoid @@ -314,11 +404,79 @@ class HSigmoid(layers.Layer): return out +class PReLU(layers.Layer): + """ + PReLU Activation. + + .. math:: + + PReLU(x) = max(0, x) + weight * min(0, x) + + Parameters: + num_parameters (int, optional): Number of `weight` to learn. The supported values are: + 1 - a single parameter `alpha` is used for all input channels; + Number of channels - a seperate `alpha` is used for each input channel. + Default is 1. + init (float, optional): Init value of learnable `weight`. Default is 0.25. + weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. + Default is None. For more information, please refer to :ref:`api_fluid_ParamAttr`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. Default dtype is float32. + - output: Tensor with the same shape as input. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + paddle.set_default_dtype("float64") + + data = np.array([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], 'float64') + x = paddle.to_tensor(data) + m = paddle.nn.PReLU(1, 0.25) + out = m(x) + # [[[[-0.5 , 3. , -1. , 5. ], + # [ 3. , -1. , 5. , -1.5 ], + # [-1.75, -2. , 8. , 9. ]], + # [[ 1. , -0.5 , -0.75, 4. ], + # [-1.25, 6. , 7. , -2. ], + # [ 6. , 7. , 8. , 9. ]]]] + """ + + def __init__(self, num_parameters=1, init=0.25, weight_attr=None, + name=None): + super(PReLU, self).__init__() + self._num_parameters = num_parameters + self._init = init + self._weight_attr = weight_attr + self._name = name + + self._weight = self.create_parameter( + attr=self._weight_attr, + shape=[self._num_parameters], + dtype=get_default_dtype(), + is_bias=False, + default_initializer=Constant(self._init)) + + def forward(self, x): + return F.prelu(x, self._weight) + + class ReLU(layers.Layer): """ ReLU Activation. - .. math: + .. math:: ReLU(x) = max(x, 0) @@ -333,14 +491,14 @@ class ReLU(layers.Layer): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) - m = paddle.nn.ReLU() - out = m(x) # [0., 0., 1.] + x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) + m = paddle.nn.ReLU() + out = m(x) # [0., 0., 1.] """ def __init__(self, name=None): @@ -351,17 +509,108 @@ class ReLU(layers.Layer): return F.relu(x, self._name) +class ReLU6(layers.Layer): + """ + ReLU6 Activation + + .. math:: + + ReLU6(x) = min(max(0,x), 6) + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-1, 0.3, 6.5])) + m = paddle.nn.ReLU6() + out = m(x) # [0, 0.3, 6] + """ + + def __init__(self, name=None): + super(ReLU6, self).__init__() + self._name = name + + def forward(self, x): + return F.relu6(x, self._name) + + +class SELU(layers.Layer): + """ + SELU Activation + + .. math:: + + SELU(x)= scale * + \\begin{cases} + x, \\text{if } x > 0 \\\\ + alpha * e^{x} - alpha, \\text{if } x <= 0 + \\end{cases} + + Parameters: + scale (float, optional): The value of scale(must be greater than 1.0) for SELU. Default is 1.0507009873554804934193349852946 + alpha (float, optional): The value of alpha(must be no less than zero) for SELU. Default is 1.6732632423543772848170429916717 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. 
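With num_parameters=1 and init=0.25, the PReLU layer above computes max(0, x) + 0.25 * min(0, x) elementwise. A NumPy sketch of just that formula, with the learnable weight frozen at 0.25 (an assumption mirroring the docstring example, not the layer itself), reproduces the first row of that example:

.. code-block:: python

    import numpy as np

    def prelu_ref(x, weight=0.25):
        # PReLU(x) = max(0, x) + weight * min(0, x); `weight` is learnable in the layer
        x = np.asarray(x, dtype=np.float64)
        return np.maximum(0.0, x) + weight * np.minimum(0.0, x)

    print(prelu_ref([-2.0, 3.0, -4.0, 5.0]))  # [-0.5  3.  -1.   5. ]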
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]])) + m = paddle.nn.SELU() + out = m(x) # [[0, 1.050701],[2.101402, 3.152103]] + """ + + def __init__(self, + scale=1.0507009873554804934193349852946, + alpha=1.6732632423543772848170429916717, + name=None): + super(SELU, self).__init__() + self._scale = scale + self._alpha = alpha + self._name = name + + def forward(self, x): + return F.selu(x, self._scale, self._alpha, self._name) + + class LeakyReLU(layers.Layer): """ Leaky ReLU Activation. - .. math: + .. math:: - out = max(x, alpha * x) + LeakyReLU(x)= + \\left\\{ + \\begin{aligned} + &x, & & if \\ x >= 0 \\\\ + &negative\_slope * x, & & otherwise \\\\ + \\end{aligned} + \\right. \\\\ Parameters: - alpha (float, optional): Slope of the activation function at :math:`x < 0` . - Default: 0.01. + negative_slope (float, optional): Slope of the activation function at + :math:`x < 0` . Default is 0.01. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -372,23 +621,23 @@ class LeakyReLU(layers.Layer): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - lrelu = paddle.nn.LeakyReLU() - x = paddle.to_tensor(np.array([-2, 0, 1], 'float32')) - out = lrelu(x) # [-0.02, 0., 1.] + m = paddle.nn.LeakyReLU() + x = paddle.to_tensor(np.array([-2, 0, 1], 'float32')) + out = m(x) # [-0.02, 0., 1.] """ - def __init__(self, alpha=1e-2, name=None): + def __init__(self, negative_slope=0.01, name=None): super(LeakyReLU, self).__init__() - self._alpha = alpha + self._negative_slope = negative_slope self._name = name def forward(self, x): - return F.leaky_relu(x, self._alpha, self._name) + return F.leaky_relu(x, self._negative_slope, self._name) class Sigmoid(layers.Layer): @@ -397,7 +646,7 @@ class Sigmoid(layers.Layer): .. math:: - output = \\frac{1}{1 + e^{-x}} + Sigmoid(x) = \frac{1}{1 + e^{-x}} Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -418,7 +667,7 @@ class Sigmoid(layers.Layer): paddle.disable_static() input_data = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32') m = paddle.nn.Sigmoid() - x = paddle.to_variable(input_data) + x = paddle.to_tensor(input_data) output = m(x) print(output.numpy()) # [0.7310586, 0.880797, 0.95257413, 0.98201376] """ @@ -431,13 +680,172 @@ class Sigmoid(layers.Layer): return F.sigmoid(x, self.name) +class Softplus(layers.Layer): + """ + Softplus Activation + + .. math:: + + Softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ + \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + + Parameters: + beta (float, optional): The value of beta for Softplus. Default is 1 + threshold (float, optional): The value of threshold for Softplus. Default is 20 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. 
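The SELU defaults above are the fixed scale/alpha constants from the self-normalizing-networks formulation; a NumPy sketch of the piecewise formula (formula only, not the operator) reproduces the docstring output:

.. code-block:: python

    import numpy as np

    def selu_ref(x,
                 scale=1.0507009873554804934193349852946,
                 alpha=1.6732632423543772848170429916717):
        # scale * x for positive inputs, scale * alpha * (e^x - 1) otherwise
        x = np.asarray(x, dtype=np.float64)
        return scale * np.where(x > 0, x, alpha * (np.exp(x) - 1))

    print(np.round(selu_ref([[0.0, 1.0], [2.0, 3.0]]), 6))
    # [[0.       1.050701]
    #  [2.101402 3.152103]]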
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + m = paddle.nn.Softplus() + out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355] + """ + + def __init__(self, beta=1, threshold=20, name=None): + super(Softplus, self).__init__() + self._beta = beta + self._threshold = threshold + self._name = name + + def forward(self, x): + return F.softplus(x, self._beta, self._threshold, self._name) + + +class Softshrink(layers.Layer): + """ + Softshrink Activation + + .. math:: + + Softshrink(x)= \\begin{cases} + x - threshold, \\text{if } x > threshold \\\\ + x + threshold, \\text{if } x < -threshold \\\\ + 0, \\text{otherwise} + \\end{cases} + + Parameters: + threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) + m = paddle.nn.Softshrink() + out = m(x) # [-0.4, 0, 0, 0.3] + """ + + def __init__(self, threshold=0.5, name=None): + super(Softshrink, self).__init__() + self._threshold = threshold + self._name = name + + def forward(self, x): + return F.softshrink(x, self._threshold, self._name) + + +class Softsign(layers.Layer): + """ + Softsign Activation + + .. math:: + + Softsign(x) = \\frac{x}{1 + |x|} + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + m = paddle.nn.Softsign() + out = m(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] + """ + + def __init__(self, name=None): + super(Softsign, self).__init__() + self._name = name + + def forward(self, x): + return F.softsign(x, self._name) + + +class Tanhshrink(layers.Layer): + """ + Tanhshrink Activation + + .. math:: + + Tanhshrink(x) = x - tanh(x) + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + m = paddle.nn.Tanhshrink() + out = m(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] + """ + + def __init__(self, name=None): + super(Tanhshrink, self).__init__() + self._name = name + + def forward(self, x): + return F.tanhshrink(x, self._name) + + class LogSigmoid(layers.Layer): """ LogSigmoid Activation. - .. math: + .. math:: - LogSigmoid(x) = \log \frac{1}{1 + e^{-x}} + LogSigmoid(x) = log \\frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, or float64. @@ -451,14 +859,14 @@ class LogSigmoid(layers.Layer): Examples: .. 
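The beta/threshold pair documented for Softplus above exists for numerical stability: once beta * x exceeds the threshold the function is treated as linear. A NumPy sketch of that rule (a reference formula, not the operator) reproduces the docstring output:

.. code-block:: python

    import numpy as np

    def softplus_ref(x, beta=1.0, threshold=20.0):
        # log(1 + e^(beta * x)) / beta, falling back to x itself once beta * x > threshold
        x = np.asarray(x, dtype=np.float64)
        return np.where(beta * x > threshold, x, np.log1p(np.exp(beta * x)) / beta)

    print(np.round(softplus_ref([-0.4, -0.2, 0.1, 0.3]), 6))
    # [0.513015 0.598139 0.744397 0.854355]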
code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) - m = paddle.nn.LogSigmoid() - out = m(x) # [0.7310586, 0.880797, 0.95257413, 0.98201376] + x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) + m = paddle.nn.LogSigmoid() + out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ def __init__(self, name=None): @@ -469,6 +877,137 @@ class LogSigmoid(layers.Layer): return F.logsigmoid(x, self._name) +class Softmax(layers.Layer): + """ + Softmax Activation. + + This operator implements the softmax layer. The calculation process is as follows: + + 1. The dimension :attr:`axis` of ``x`` will be permuted to the last. + + 2. Then ``x`` will be logically flattened to a 2-D matrix. The matrix's second + dimension(row length) is the same as the dimension :attr:`axis` of ``x``, + and the first dimension(column length) is the product of all other dimensions + of ``x``. For each row of the matrix, the softmax operator squashes the + K-dimensional(K is the width of the matrix, which is also the size of ``x``'s + dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional + vector of real values in the range [0, 1] that add up to 1. + + 3. After the softmax operation is completed, the inverse operations of steps 1 and 2 + are performed to restore the two-dimensional matrix to the same dimension as the ``x`` . + + It computes the exponential of the given dimension and the sum of exponential + values of all the other dimensions in the K-dimensional vector input. + Then the ratio of the exponential of the given dimension and the sum of + exponential values of all the other dimensions is the output of the softmax + operator. + + For each row :math:`i` and each column :math:`j` in the matrix, we have: + + .. math:: + + Softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} + + Example: + + .. code-block:: text + + Case 1: + Input: + x.shape = [2, 3, 4] + x.data = [[[2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0], + [7.0, 8.0, 8.0, 9.0]], + [[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [6.0, 7.0, 8.0, 9.0]]] + + Attrs: + axis = -1 + + Output: + out.shape = [2, 3, 4] + out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], + [0.0320586 , 0.08714432, 0.23688282, 0.64391426], + [0.07232949, 0.19661193, 0.19661193, 0.53444665]], + [[0.0320586 , 0.08714432, 0.23688282, 0.64391426], + [0.0320586 , 0.08714432, 0.23688282, 0.64391426], + [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]] + + Case 2: + Input: + x.shape = [2, 3, 4] + x.data = [[[2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0], + [7.0, 8.0, 8.0, 9.0]], + [[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [6.0, 7.0, 8.0, 9.0]]] + Attrs: + axis = 1 + + Output: + out.shape = [2, 3, 4] + out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783], + [0.01786798, 0.01786798, 0.04661262, 0.04661262], + [0.97555875, 0.97555875, 0.93623955, 0.93623955]], + [[0.00490169, 0.00490169, 0.00490169, 0.00490169], + [0.26762315, 0.26762315, 0.26762315, 0.26762315], + [0.72747516, 0.72747516, 0.72747516, 0.72747516]]] + + Parameters: + axis (int, optional): The axis along which to perform log_softmax + calculations. It should be in range [-D, D), where D is the + dimensions of ``x`` . If ``axis`` < 0, it works the same way as + :math:`axis + D` . Default is -1. + dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data + type of the output tensor. 
If dtype is specified, ``x`` is casted + to ``dtype`` before the operation is performed. This is useful for + preventing data type overflows. Supported dtype: float32, float64. + If ``dtype`` is None, the output Tensor has the same dtype as x. + Default is None. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = np.array([[[2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0], + [7.0, 8.0, 8.0, 9.0]], + [[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [6.0, 7.0, 8.0, 9.0]]], 'float32') + x = paddle.to_tensor(x) + m = paddle.nn.Softmax() + out = m(x) + # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.07232949, 0.19661193, 0.19661193, 0.53444665]], + # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], + # [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]] + """ + + def __init__(self, axis=-1, name=None): + super(Softmax, self).__init__() + self._axis = axis + self._dtype = None + self._name = name + + def forward(self, x): + return F.softmax(x, self._axis, self._dtype, self._name) + + class LogSoftmax(layers.Layer): """ This operator implements the log_softmax layer. The calculation process is as follows: @@ -476,7 +1015,7 @@ class LogSoftmax(layers.Layer): .. math:: Out[i, j] = log(softmax(x)) - = log(\\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}) + = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])}) Parameters: axis (int, optional): The axis along which to perform log_softmax @@ -493,26 +1032,26 @@ class LogSoftmax(layers.Layer): Examples: .. code-block:: python - import paddle - import numpy as np - - paddle.disable_static() - - x = np.array([[[-2.0, 3.0, -4.0, 5.0], - [3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [6.0, 7.0, 8.0, 9.0]]]) - m = paddle.nn.LogSoftmax() - x = paddle.to_tensor(x) - out = m(x) - # [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948] - # [ -2.1270514 -9.127051 -0.12705144 -11.127051 ] - # [-16.313261 -17.313261 -1.3132617 -0.31326184]] - # [[ -3.0518122 -6.051812 -7.051812 -0.051812 ] - # [-12.313267 -1.3132664 -0.3132665 -15.313267 ] - # [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]] + import paddle + import numpy as np + + paddle.disable_static() + + x = np.array([[[-2.0, 3.0, -4.0, 5.0], + [3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [6.0, 7.0, 8.0, 9.0]]]) + m = paddle.nn.LogSoftmax() + x = paddle.to_tensor(x) + out = m(x) + # [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948] + # [ -2.1270514 -9.127051 -0.12705144 -11.127051 ] + # [-16.313261 -17.313261 -1.3132617 -0.31326184]] + # [[ -3.0518122 -6.051812 -7.051812 -0.051812 ] + # [-12.313267 -1.3132664 -0.3132665 -15.313267 ] + # [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]] """ def __init__(self, axis=-1, name=None): diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 45259bea49d42eb07e0e593531a1680359f81a68..a1e6508c67d96e9f6cc077efe6e61d708674b057 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -12,21 +12,125 @@ # See the License for the specific language governing permissions and # limitations under the License. 
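The Case 1 numbers in the Softmax docstring above can be checked with a small NumPy sketch of the row-wise definition, using the usual max-subtraction for numerical stability (the layer itself simply calls F.softmax):

.. code-block:: python

    import numpy as np

    def softmax_ref(x, axis=-1):
        # subtract the per-row maximum, exponentiate, then normalize along `axis`
        x = np.asarray(x, dtype=np.float64)
        e = np.exp(x - x.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    rows = np.array([[2.0, 3.0, 4.0, 5.0],
                     [7.0, 8.0, 8.0, 9.0]])
    print(np.round(softmax_ref(rows), 8))
    # [[0.0320586  0.08714432 0.23688282 0.64391426]
    #  [0.07232949 0.19661193 0.19661193 0.53444665]]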
-# TODO: define the common classes to build a neural network +# TODO: define the common classes to build a neural network from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS from ...fluid.dygraph import Pool2D #DEFINE_ALIAS from ...fluid.dygraph import Embedding #DEFINE_ALIAS -from ...fluid.dygraph import Linear #DEFINE_ALIAS from ...fluid.dygraph import Flatten #DEFINE_ALIAS from ...fluid.dygraph import layers from .. import functional as F +from ...fluid.framework import _dygraph_tracer __all__ = [ - 'BilinearTensorProduct', 'Pool2D', 'Embedding', 'Linear', 'UpSample', - 'Pad2D' + 'BilinearTensorProduct', + 'Pool2D', + 'Embedding', + 'Linear', + 'UpSample', + 'Pad2D', + 'UpsamplingNearest2d', + 'UpsamplingBilinear2d', + 'ReflectionPad1d', + 'ReplicationPad1d', + 'ConstantPad1d', + 'ReflectionPad2d', + 'ReplicationPad2d', + 'ConstantPad2d', + 'ZeroPad2d', + 'ConstantPad3d', + 'ReplicationPad3d', + 'CosineSimilarity', + 'Dropout', + 'Dropout2D', + 'Dropout3D', + 'Bilinear', + 'AlphaDropout', ] +class Linear(layers.Layer): + """ + + Fully-connected linear transformation layer: + + .. math:: + + Out = {XW + b} + + where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively. + + Linear layer takes only one ``Tensor`` input. + The Linear layer multiplies input tensor with weight matrix and + produces an output Tensor of shape [N, *, `output_dim`], + where N is batch size and `*` means any number of additional dimensions. + If ``bias_attr`` is not None, a bias variable will be created and added to the output. + + Parameters: + in_features(int): The number of input units in this layer. + out_features(int): The number of output units in this layer. + weight_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable + weights(Parameter) of this layer. Default: None. + bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Default: None. + + Attributes: + **weight** (Parameter): the learnable weights of this layer. + + **bias** (Parameter or None): the learnable bias of this layer. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + from paddle import nn + import numpy as np + + data = np.ones((3,1,2), np.float32) + place = paddle.CPUPlace() + paddle.disable_static(place) + data = paddle.to_tensor(data) + weight_attr=paddle.framework.ParamAttr(name="linear_weight", learning_rate=1.0, + trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0)) + bias_attr=paddle.framework.ParamAttr(name="linear_bias", learning_rate=1.0, + trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0)) + linear = nn.Linear(2,2,weight_attr=weight_attr, bias_attr=bias_attr) + res = linear(data) # [3 3 3 3 3 3] + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + super(Linear, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.name = name + self.weight = self.create_parameter( + shape=[in_features, out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.bias = self.create_parameter( + shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.name = name + + def forward(self, input): + out = F.linear( + x=input, weight=self.weight, bias=self.bias, name=self.name) + return out + + class UpSample(layers.Layer): """ This op resizes a batch of images. @@ -34,8 +138,7 @@ class UpSample(layers.Layer): or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), and the resizing only applies on the three dimensions(depth, height and width). - **Warning:** the parameter :attr:`actual_shape` will be deprecated in the - future and only use :attr:`out_shape` instead. + Supporting resample methods: 'linear' : Linear interpolation 'bilinear' : Bilinear interpolation @@ -65,7 +168,7 @@ class UpSample(layers.Layer): interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - Align_corners and align_mode are optional parameters,the calculation method + align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -163,16 +266,16 @@ class UpSample(layers.Layer): https://en.wikipedia.org/wiki/Trilinear_interpolation. Parameters: - input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, + x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. - size (list|tuple|Variable|None): Output shape of image resize + size (list|tuple|Tensor|None): Output shape of image resize layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Variable|None): The multiplier for the input height or width. At + scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. 
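The reworked Linear layer above applies Out = XW + b over the last dimension and keeps any leading dimensions, which is why the (3, 1, 2) input in its example comes back as a (3, 1, 2) output full of 3s. The same arithmetic in plain NumPy, with weights and bias fixed to ones to mirror the ParamAttr initializers in that example:

.. code-block:: python

    import numpy as np

    x = np.ones((3, 1, 2), dtype=np.float32)   # [N, *, in_features]
    w = np.ones((2, 2), dtype=np.float32)      # [in_features, out_features]
    b = np.ones((2,), dtype=np.float32)        # [out_features]

    out = x @ w + b                            # matmul over the last axis, bias broadcast
    print(out.shape)    # (3, 1, 2)
    print(out.ravel())  # [3. 3. 3. 3. 3. 3.]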
+ And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. Default: None. mode (str): The resample method. It supports 'linear', 'nearst', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' @@ -196,7 +299,7 @@ class UpSample(layers.Layer): A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). Raises: - TypeError: size should be a list or tuple or Variable. + TypeError: size should be a list or tuple or Tensor. ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', 'trilinear', 'bicubic', or 'nearest' currently. ValueError: 'linear' only support 3-D tensor. @@ -214,16 +317,18 @@ class UpSample(layers.Layer): Examples: .. code-block:: python import paddle + import paddle.nn as nn import numpy as np - import paddle.fluid.dygraph as dg - upsample_op = paddle.nn.UpSample(size=[12,12]) + paddle.disable_static() + input_data = np.random.rand(2,3,6,10).astype("float32") - place = paddle.fluid.CPUPlace() - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = upsample_op(input=input) - print(output.shape) - # [2L, 3L, 12L, 12L] + upsample_out = paddle.nn.UpSample(size=[12,12]) + + input = paddle.to_tensor(input_data) + output = upsample_out(x=input) + print(output.shape) + # [2L, 3L, 12L, 12L] + """ def __init__(self, @@ -231,8 +336,9 @@ class UpSample(layers.Layer): scale_factor=None, mode='nearest', align_corners=False, - align_mode=1, - data_format='NCHW'): + align_mode=0, + data_format='NCHW', + name=None): super(UpSample, self).__init__() self.size = size self.scale_factor = scale_factor @@ -240,16 +346,184 @@ class UpSample(layers.Layer): self.align_corners = align_corners self.align_mode = align_mode self.data_format = data_format + self.name = name - def forward(self, input): + def forward(self, x): out = F.interpolate( - input, + x, size=self.size, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners, align_mode=self.align_mode, - data_format=self.data_format) + data_format=self.data_format, + name=self.name) + + return out + + +class UpsamplingNearest2d(layers.Layer): + """ + This op upsamples a batch of images, using nearest neighbours' pixel values. + The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), + and the upsampling only applies on the two dimensions(height and width). + + Nearest neighbor interpolation is to perform nearest neighbor interpolation + in both the 3rd dimension(in height direction) and the 4th dimension(in width + direction) on input tensor. + + For details of nearest neighbor interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. + + x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8, + its data format is specified by :attr:`data_format`. + size (list|tuple|Tensor|None): Output shape of image resize + layer, the shape is (out_h, out_w) when input is a 4-D Tensor. + Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. + If a Tensor Variable, its dimensions size should be a 1. + scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`out_shape` or :attr:`scale_factor` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. + Default: None. 
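As the UpSample notes above say, align_corners and align_mode select how destination pixels are mapped back to source coordinates in the linear-family modes. The sketch below illustrates the two common conventions (corner-aligned endpoints versus half-pixel centres) as an assumption about the idea only; F.interpolate remains the authoritative definition of the operator's behaviour:

.. code-block:: python

    import numpy as np

    def source_coords(out_size, in_size, align_corners, align_mode=0):
        # destination index -> source coordinate for 1-D linear resizing (sketch only)
        j = np.arange(out_size, dtype=np.float64)
        if align_corners:
            return j * (in_size - 1) / (out_size - 1)      # endpoints map to endpoints
        if align_mode == 0:
            return np.clip((j + 0.5) * in_size / out_size - 0.5, 0, in_size - 1)
        return j * in_size / out_size                       # align_mode == 1

    print(source_coords(4, 2, align_corners=True))   # [0. 0.33333333 0.66666667 1.]
    print(source_coords(4, 2, align_corners=False))  # [0.   0.25 0.75 1.  ]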
Has to match input size if it is a list. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, + `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored + in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + Returns: + A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), + Raises: + TypeError: size should be a list or tuple or Tensor. + ValueError: 'nearest' only support 4-D tensor. + ValueError: One of size and scale_factor must not be None. + ValueError: size length should be 2 for input 4-D tensor. + ValueError: scale_factor should be greater than zero. + ValueError: data_format can only be 'NCHW', 'NHWC'. + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_data = np.random.rand(2,3,6,10).astype("float32") + upsample_out = paddle.nn.UpsamplingNearest2d(size=[12,12]) + + input = paddle.to_tensor(input_data) + output = upsample_out(x=input) + print(output.shape) + # [2L, 3L, 12L, 12L] + + """ + + def __init__(self, + size=None, + scale_factor=None, + data_format='NCHW', + name=None): + super(UpsamplingNearest2d, self).__init__() + self.size = size + self.scale_factor = scale_factor + self.data_format = data_format + self.name = name + + def forward(self, x): + out = F.interpolate( + x, + size=self.size, + scale_factor=self.scale_factor, + mode='nearest', + align_corners=False, + align_mode=0, + data_format=self.data_format, + name=self.name) + + return out + + +class UpsamplingBilinear2d(layers.Layer): + """ + This op upsamples a batch of images, using bilinear' pixel values. + The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), + and the upsampling only applies on the two dimensions(height and width). + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then + again in the other direction. + + For details of bilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bilinear_interpolation. + + x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8, + its data format is specified by :attr:`data_format`. + size (list|tuple|Tensor|None): Output shape of image resize + layer, the shape is (out_h, out_w) when input is a 4-D Tensor. + Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. + If a Tensor Variable, its dimensions size should be a 1. + scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`out_shape` or :attr:`scale_factor` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. + Default: None. Has to match input size if it is a list. 
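For integer scale factors, the nearest-neighbour resizing performed by UpsamplingNearest2d reduces to repeating pixels along the spatial axes. A NumPy sketch of that special case only (the actual layer goes through F.interpolate and also accepts non-integer scales or an explicit size):

.. code-block:: python

    import numpy as np

    def upsample_nearest_int(x, scale=2):
        # NCHW input: repeat each pixel `scale` times along H and then W
        return x.repeat(scale, axis=2).repeat(scale, axis=3)

    x = np.arange(4, dtype=np.float32).reshape(1, 1, 2, 2)
    y = upsample_nearest_int(x)
    print(y.shape)     # (1, 1, 4, 4)
    print(y[0, 0])
    # [[0. 0. 1. 1.]
    #  [0. 0. 1. 1.]
    #  [2. 2. 3. 3.]
    #  [2. 2. 3. 3.]]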
+ data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, + `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored + in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + Returns: + A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), + Raises: + TypeError: size should be a list or tuple or Tensor. + ValueError: 'bilinear' only support 4-D tensor. + ValueError: One of size and scale_factor must not be None. + ValueError: size length should be 2 for input 4-D tensor. + ValueError: scale_factor should be greater than zero. + ValueError: data_format can only be 'NCHW', 'NHWC'. + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_data = np.random.rand(2,3,6,10).astype("float32") + upsample_out = paddle.nn.UpsamplingBilinear2d(size=[12,12]) + + input = paddle.to_tensor(input_data) + output = upsample_out(x=input) + print(output.shape) + # [2L, 3L, 12L, 12L] + """ + + def __init__(self, + size=None, + scale_factor=None, + data_format='NCHW', + name=None): + super(UpsamplingBilinear2d, self).__init__() + self.size = size + self.scale_factor = scale_factor + self.data_format = data_format + self.name = name + + def forward(self, x): + out = F.interpolate( + x, + size=self.size, + scale_factor=self.scale_factor, + mode='bilinear', + align_corners=True, + align_mode=0, + data_format=self.data_format, + name=self.name) return out @@ -258,12 +532,10 @@ class Pad2D(layers.Layer): """ :alias_main: paddle.nn.Pad2D :alias: paddle.nn.Pad2D,paddle.nn.layer.Pad2D,paddle.nn.layer.common.Pad2D - This interface is used to construct a callable object of the ``Pad2D`` class. The Pad2D layer pads the input tensor boundaries according to 'paddings' and 'mode'. If mode is 'reflect', paddings[0] and paddings[1] must be no greater than height-1. And the width dimension has the same condition. - Parameters: paddings (int | List[int32]): The padding size. If padding is a int, uses the same padding in all boundaries, if padding is a List, it must contain four integers, @@ -278,16 +550,12 @@ class Pad2D(layers.Layer): data_format (str): An string from: "NHWC", "NCHW". Specify the data format of the input data. Default is "NCHW" - Returns: None - Examples: .. code-block:: text - Input = [[[[1., 2., 3.], [4., 5., 6.]]]] - Case 0: paddings = [0, 1, 2, 3], mode = 'constant' @@ -295,24 +563,20 @@ class Pad2D(layers.Layer): Out = [[[[0., 0., 1., 2., 3., 0., 0., 0.], [0., 0., 4., 5., 6., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0.]]]] - Case 1: paddings = [0, 1, 2, 1], mode = 'reflect' Out = [[[[3., 2., 1., 2., 3., 2.], [6., 5., 4., 5., 6., 5.], [3., 2., 1., 2., 3., 2.]]]] - Case 2: paddings = [0, 1, 2, 1], mode = 'edge' Out = [[[[1., 1., 1., 2., 3., 3.], [4., 4., 4., 5., 6., 6.], [4., 4., 4., 5., 6., 6.]]]] - Code Examples: .. 
code-block:: python - import paddle.fluid as fluid import paddle.nn as nn import numpy as np @@ -342,3 +606,944 @@ class Pad2D(layers.Layer): mode=self._mode, pad_value=self._pad_value, data_format=self._data_format) + + +class Bilinear(layers.Layer): + """ + + This layer performs bilinear on two inputs. + + .. math:: + + out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1 + + out = out + b + + In this formula: + - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features]. + - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features]. + - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features]. + - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features]. + - :math:`b`: the learned bias, shape is [1, out_features]. + - :math:`x2^\mathrm{T}`: the transpose of :math:`x2`. + + Parameters: + in1_features (int): The dimension of each first input(`x1`). + in2_features (int): The dimension of each second input(`x2`). + out_features (int): The dimension of output of this layer. + weight_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of + this layer. The default value is None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. The default value is None. + name (str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. + + Attribute: + **weight** (Parameter): the learnable weights of this layer. + + **bias** (Parameter): the learnable bias of this layer. + + Returns: + Tensor: A 2-D Tensor of shape [batch_size, out_features]. + + Examples: + .. 
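The three Pad2D cases above (constant, reflect, edge) map directly onto np.pad once the [top, bottom, left, right] padding list is regrouped per axis. A NumPy sketch reproducing Case 1:

.. code-block:: python

    import numpy as np

    x = np.array([[1., 2., 3.],
                  [4., 5., 6.]])
    # Pad2D paddings [0, 1, 2, 1] = [top, bottom, left, right] -> ((top, bottom), (left, right))
    print(np.pad(x, ((0, 1), (2, 1)), mode='reflect'))
    # [[3. 2. 1. 2. 3. 2.]
    #  [6. 5. 4. 5. 6. 5.]
    #  [3. 2. 1. 2. 3. 2.]]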
code-block:: python + + import paddle + import numpy + + paddle.disable_static() + layer1 = numpy.random.random((5, 5)).astype('float32') + layer2 = numpy.random.random((5, 4)).astype('float32') + bilinear = paddle.nn.Bilinear( + in1_features=5, in2_features=4, out_features=1000) + result = bilinear(paddle.to_tensor(layer1), + paddle.to_tensor(layer2)) # result shape [5, 1000] + + """ + + def __init__(self, + in1_features, + in2_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + super(Bilinear, self).__init__() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._name = name + self._in1_features = in1_features + self._in2_features = in2_features + self._out_features = out_features + self._dtype = self._helper.get_default_dtype() + + weight_shape = [ + self._out_features, self._in1_features, self._in2_features + ] + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=weight_shape, + dtype=self._dtype, + is_bias=False) + bias_shape = [1, self._out_features] + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=bias_shape, + dtype=self._dtype, + is_bias=True) + + def forward(self, x1, x2): + return F.bilinear(x1, x2, self.weight, self.bias, self._name) + + +class Dropout(layers.Layer): + """ + Dropout is a regularization technique for reducing overfitting by preventing + neuron co-adaption during training as described in the paper: + `Improving neural networks by preventing co-adaptation of feature detectors `_ + The dropout operator randomly sets the outputs of some units to zero, while upscale others + according to the given dropout probability. + + See ``paddle.nn.functional.dropout`` for more details. + + In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled. + + Parameters: + p (float | int): Probability of setting units to zero. Default: 0.5 + axis (int | list): The axis along which the dropout is performed. Default None. + mode(str, optional): ['upscale_in_train'(default) | 'downscale_in_infer'] + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - p ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - p) + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: N-D tensor. + - output: N-D tensor, the same shape as input. + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.array([[1,2,3], [4,5,6]]).astype('float32') + x = paddle.to_tensor(x) + m = paddle.nn.Dropout(p=0.5) + y_train = m(x) + m.eval() # switch the model to test phase + y_test = m(x) + print(x.numpy()) + print(y_train.numpy()) + print(y_test.numpy()) + """ + + def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): + super(Dropout, self).__init__() + + self.p = p + self.axis = axis + self.mode = mode + self.name = name + + def forward(self, input): + out = F.dropout( + input, + p=self.p, + axis=self.axis, + training=self.training, + mode=self.mode, + name=self.name) + return out + + +class Dropout2D(layers.Layer): + """ + Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` , + a channel is a 2D feature map with the shape `HW`). 
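The Bilinear layer above evaluates out[n, i] = x1[n] * W[i] * x2[n]^T + b[i] for every output feature i, which collapses to a single einsum in NumPy. A sketch with random arrays of the shapes used in the docstring example (the layer's real weight and bias are learned parameters, not random draws):

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(0)
    x1 = rng.random((5, 5)).astype('float32')        # [batch, in1_features]
    x2 = rng.random((5, 4)).astype('float32')        # [batch, in2_features]
    w = rng.random((1000, 5, 4)).astype('float32')   # [out_features, in1_features, in2_features]
    b = np.zeros((1, 1000), dtype='float32')         # [1, out_features]

    out = np.einsum('ni,oij,nj->no', x1, w, x2) + b  # out[n, i] = x1[n] @ W[i] @ x2[n]^T + b[i]
    print(out.shape)  # (5, 1000)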
Each channel will be zeroed out independently + on every forward call with probability `p` using samples from a Bernoulli distribution. + Dropout2d will help promote independence between feature maps as described in the paper: + `Efficient Object Localization Using Convolutional Networks `_ + + See ``paddle.nn.functional.dropout2d`` for more details. + + In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled. + + Parameters: + p (float, optional): Probability of setting units to zero. Default: 0.5 + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + `NCHW`, `NHWC`. The default is `NCHW`. When it is `NCHW`, the data is + stored in the order of: [batch_size, input_channels, input_height, input_width]. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: 4-D tensor. + - output: 4-D tensor, the same shape as input. + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.random.random(size=(2, 3, 4, 5)).astype('float32') + x = paddle.to_tensor(x) + m = paddle.nn.Dropout2D(p=0.5) + y_train = m(x) + m.eval() # switch the model to test phase + y_test = m(x) + print(x.numpy()) + print(y_train.numpy()) + print(y_test.numpy()) + """ + + def __init__(self, p=0.5, data_format='NCHW', name=None): + super(Dropout2D, self).__init__() + + self.p = p + self.data_format = data_format + self.name = name + + def forward(self, input): + out = F.dropout2d( + input, + p=self.p, + training=self.training, + data_format=self.data_format, + name=self.name) + return out + + +class Dropout3D(layers.Layer): + """ + Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` , + a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently + on every forward call with probability `p` using samples from a Bernoulli distribution. + Dropout3d will help promote independence between feature maps as described in the paper: + `Efficient Object Localization Using Convolutional Networks `_ + + See ``paddle.nn.functional.dropout3d`` for more details. + + In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled. + + Parameters: + p (float | int): Probability of setting units to zero. Default: 0.5 + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + `NCDHW`, `NDHWC`. The default is `NCDHW`. When it is `NCDHW`, the data is + stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width]. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: 5-D tensor. + - output: 5-D tensor, the same shape as input. + + Examples: + .. 
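The upscale_in_train / downscale_in_infer modes listed for the Dropout family above differ only in where the 1/(1-p) rescaling happens. A NumPy sketch of the training-time behaviour of both modes, drawing the mask per element (Dropout2D and Dropout3D instead draw one mask entry per channel and broadcast it spatially):

.. code-block:: python

    import numpy as np

    def dropout_train_ref(x, p=0.5, mode="upscale_in_train", seed=0):
        rng = np.random.default_rng(seed)
        mask = (rng.random(x.shape) >= p).astype(x.dtype)
        if mode == "upscale_in_train":
            return x * mask / (1.0 - p)   # inference then returns x unchanged
        return x * mask                    # downscale_in_infer: inference returns x * (1 - p)

    x = np.ones((2, 3), dtype=np.float32)
    print(dropout_train_ref(x))            # surviving entries become 2.0, dropped ones 0.0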
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32') + x = paddle.to_tensor(x) + m = paddle.nn.Dropout3D(p=0.5) + y_train = m(x) + m.eval() # switch the model to test phase + y_test = m(x) + print(x.numpy()) + print(y_train.numpy()) + print(y_test.numpy()) + """ + + def __init__(self, p=0.5, data_format='NCDHW', name=None): + super(Dropout3D, self).__init__() + + self.p = p + self.data_format = data_format + self.name = name + + def forward(self, input): + out = F.dropout3d( + input, + p=self.p, + training=self.training, + data_format=self.data_format, + name=self.name) + return out + + +class AlphaDropout(layers.Layer): + """ + Alpha Dropout is a type of Dropout that maintains the self-normalizing property. For an input with + zero mean and unit standard deviation, the output of Alpha Dropout maintains the original mean and + standard deviation of the input. Alpha Dropout fits well to SELU activate function by randomly setting + activations to the negative saturation value. + + For more information, please refer to: + `Self-Normalizing Neural Networks `_ + + In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled. + + Parameters: + p (float | int): Probability of setting units to zero. Default: 0.5 + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: N-D tensor. + - output: N-D tensor, the same shape as input. + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + x = np.array([[-1, 1], [-1, 1]]).astype('float32') + x = paddle.to_tensor(x) + m = paddle.nn.AlphaDropout(p=0.5) + y_train = m(x) + m.eval() # switch the model to test phase + y_test = m(x) + print(x.numpy()) + print(y_train.numpy()) + # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly) + print(y_test.numpy()) + """ + + def __init__(self, p=0.5, name=None): + super(AlphaDropout, self).__init__() + self.p = p + self.name = name + + def forward(self, input): + out = F.alpha_dropout( + input, p=self.p, training=self.training, name=self.name) + return out + + +class ReflectionPad1d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ReflectionPad1d`` class. + Uses reflection of the input boundaries to pad the input tensor. + + Parameters: + padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right). + data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data. + Default is "NCL" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[1., 2., 3.], + [4., 5., 6.]]] + padding = [1, 2], + Out = [[[2. 1. 2. 3. 2. 1.] + [5. 4. 5. 6. 5. 4.]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 2, 3) + pad = [1, 2] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ReflectionPad1d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[2. 1. 2. 3. 2. 1.] + # [5. 4. 5. 6. 5. 
4.]]] + """ + + def __init__(self, padding, data_format="NCL", name=None): + super(ReflectionPad1d, self).__init__() + self._mode = "reflect" + self._data_format = data_format + self._pad = padding + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + data_format=self._data_format, + name=self._name) + + +class ReplicationPad1d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ReplicationPad1d`` class. + Uses input boundaries to pad the input tensor. + + Parameters: + padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right). + data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data. + Default is "NCL" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[1., 2., 3.], + [4., 5., 6.]]] + padding = [1, 2], + Out = [[[2. 1. 2. 3. 2. 1.] + [5. 4. 5. 6. 5. 4.]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 2, 3) + pad = [1, 2] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ReplicationPad1d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[1. 1. 2. 3. 3. 3.] + # [1. 4. 5. 6. 6. 6.]]] + """ + + def __init__(self, padding, data_format="NCL", name=None): + super(ReplicationPad1d, self).__init__() + self._mode = "replicate" + self._data_format = data_format + self._pad = padding + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + data_format=self._data_format, + name=self._name) + + +class ConstantPad1d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ConstantPad1d`` class. + Uses a constant value to pad the input tensor. + + Parameters: + padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right). + value (float32): The value to fill the padded areas. Default is 0.0 + data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data. + Default is "NCL" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[1., 2., 3.], + [4., 5., 6.]]] + padding = [1, 2], + value = 0.0 + Out = [[[0. 1. 2. 3. 0. 0.] + [0. 4. 5. 6. 0. 0.]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 2, 3) + pad = [1, 2] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ConstantPad1d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[0. 1. 2. 3. 0. 0.] + # [0. 4. 5. 6. 0. 
0.]]] + """ + + def __init__(self, padding, value=0.0, data_format="NCL", name=None): + super(ConstantPad1d, self).__init__() + self._mode = "constant" + self._data_format = data_format + self._pad = padding + self._value = value + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name) + + +class ConstantPad2d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ConstantPad2d`` class. + Uses a constant value to pad the input tensor. + + Parameters: + padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). + value (float32): The value to fill the padded areas. Default is 0.0 + data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. + Default is "NCHW" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[[1., 2., 3.], + [4., 5., 6.]]]] + padding = [1, 1, 0, 0] + value = 0.0 + Out = [[[[0. 1. 2. 3. 0.] + [0. 4. 5. 6. 0.]]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 1, 2, 3) + pad = [1, 0, 1, 2] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ConstantPad2d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[[0. 0. 0. 0.] + # [0. 1. 2. 3.] + # [0. 4. 5. 6.] + # [0. 0. 0. 0.] + # [0. 0. 0. 0.]]]] + """ + + def __init__(self, padding, value=0.0, data_format="NCHW", name=None): + super(ConstantPad2d, self).__init__() + self._mode = "constant" + self._data_format = data_format + self._pad = padding + self._value = value + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name) + + +class ZeroPad2d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ZeroPad2d`` class. + Uses 0 to pad the input tensor. + + Parameters: + padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). + data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. + Default is "NCHW" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[[1., 2., 3.], + [4., 5., 6.]]]] + padding = [1, 1, 0, 0] + Out = [[[[0. 1. 2. 3. 0.] + [0. 4. 5. 6. 0.]]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 1, 2, 3) + pad = [1, 0, 1, 2] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ZeroPad2d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[[0. 0. 0. 0.] + # [0. 1. 2. 3.] + # [0. 4. 5. 6.] + # [0. 0. 0. 0.] + # [0. 0. 0. 
0.]]]] + """ + + def __init__(self, padding, data_format="NCHW", name=None): + super(ZeroPad2d, self).__init__() + self._mode = "constant" + self._data_format = data_format + self._pad = padding + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + data_format=self._data_format, + name=self._name) + + +class ReplicationPad2d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ReplicationPad2d`` class. + Uses input boundaries to pad the input tensor. + + Parameters: + padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). + data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. + Default is "NCHW" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[[1., 2., 3.], + [4., 5., 6.]]]] + padding = [1, 1, 0, 0] + Out = [[[[1. 1. 2. 3. 3.] + [4. 4. 5. 6. 6.]]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 1, 2, 3) + pad = [1, 0, 1, 2] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ReplicationPad2d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[[1. 1. 2. 3.] + # [1. 1. 2. 3.] + # [4. 4. 5. 6.] + # [4. 4. 5. 6.] + # [4. 4. 5. 6.]]]] + """ + + def __init__(self, padding, data_format="NCHW", name=None): + super(ReplicationPad2d, self).__init__() + self._mode = "replicate" + self._data_format = data_format + self._pad = padding + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + data_format=self._data_format, + name=self._name) + + +class ReflectionPad2d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ReflectionPad2d`` class. + Uses reflection of the input boundaries to pad the input tensor. + + Parameters: + padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). + data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. + Default is "NCHW" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[[1., 2., 3.], + [4., 5., 6.]]]] + padding = [1, 1, 0, 0] + Out = [[[[2. 1. 2. 3. 2.] + [5. 4. 5. 6. 5.]]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 1, 4, 3) + pad = [1, 0, 1, 2] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ReflectionPad2d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[[ 5. 4. 5. 6.] + # [ 2. 1. 2. 3.] + # [ 5. 4. 5. 6.] + # [ 8. 7. 8. 9.] + # [11. 10. 11. 12.] + # [ 8. 7. 8. 9.] + # [ 5. 4. 5. 
6.]]]] + """ + + def __init__(self, padding, data_format="NCHW", name=None): + super(ReflectionPad2d, self).__init__() + self._mode = "reflect" + self._data_format = data_format + self._pad = padding + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + data_format=self._data_format, + name=self._name) + + +class ConstantPad3d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ConstantPad3d`` class. + Uses a constant value to pad the input tensor. + + Parameters: + padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). + value (float32): The value to fill the padded areas. Default is 0.0 + data_format (str): An string from: "NCDHW", "NDHWC". Specify the data format of the input data. + Default is "NCDHW" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[[[1., 2., 3.], + [4., 5., 6.]]]]] + padding = [1, 2, 0, 0, 0, 0] + value = 0.0 + Out = [[[[[0. 1. 2. 3. 0. 0.] + [0. 4. 5. 6. 0. 0.]]]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 1, 1, 2, 3) + pad = [1, 0, 1, 2, 0, 0] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ConstantPad3d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[[[0. 0. 0. 0.] + # [0. 1. 2. 3.] + # [0. 4. 5. 6.] + # [0. 0. 0. 0.] + # [0. 0. 0. 0.]]]]] + """ + + def __init__(self, padding, value=0.0, data_format="NCDHW", name=None): + super(ConstantPad3d, self).__init__() + self._mode = "constant" + self._data_format = data_format + self._pad = padding + self._value = value + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name) + + +class ReplicationPad3d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ReplicationPad3d`` class. + Uses input boundaries to pad the input tensor. + + Parameters: + padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions + of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). + data_format (str): An string from: "NCDHW", "NDHWC". Specify the data format of the input data. + Default is "NCDHW" + name (str, optional) : The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + None + + Examples: + .. code-block:: text + + x = [[[[[1., 2., 3.], + [4., 5., 6.]]]]] + padding = [1, 2, 0, 0, 0, 0] + Out = [[[[[1. 1. 2. 3. 3. 3.] + [4. 4. 5. 6. 6. 6.]]]]] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + input_shape = (1, 1, 1, 2, 3) + pad = [1, 0, 1, 2, 0, 0] + data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 + my_pad = nn.ReplicationPad3d(padding=pad) + data = paddle.to_tensor(data) + result = my_pad(data) + print(result.numpy()) + # [[[[[1. 1. 2. 3.] 
+ # [1. 1. 2. 3.] + # [4. 4. 5. 6.] + # [4. 4. 5. 6.] + # [4. 4. 5. 6.]]]]] + """ + + def __init__(self, padding, data_format="NCDHW", name=None): + super(ReplicationPad3d, self).__init__() + self._mode = "replicate" + self._data_format = data_format + self._pad = padding + self._name = name + + def forward(self, x): + return F.pad(x, + pad=self._pad, + mode=self._mode, + data_format=self._data_format, + name=self._name) + + +class CosineSimilarity(layers.Layer): + """ + This interface is used to compute cosine similarity between x1 and x2 along axis. + + Parameters: + axis (int): Dimension of vectors to compute cosine similarity. Default is 1. + eps(float): Small value to avoid division by zero. Default is 1e-8. + Returns: + None + + Examples: + .. code-block:: text + + Case 0: + x1 = [[0.8024077 0.9927354 0.27238318 0.8344984 ] + [0.48949873 0.5797396 0.65444374 0.66510963] + [0.1031398 0.9614342 0.08365563 0.6796464 ] + [0.10760343 0.7461209 0.7726148 0.5801006 ]] + x2 = [[0.62913156 0.1536727 0.9847992 0.04591406] + [0.9098952 0.15715368 0.8671125 0.3156102 ] + [0.4427798 0.54136837 0.5276275 0.32394758] + [0.3769419 0.8535014 0.48041078 0.9256797 ]] + axis = 1 + eps = 1e-8 + Out: [0.5275037 0.8368967 0.75037485 0.9245899] + + Code Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + np.random.seed(0) + x1 = np.random.rand(2,3) + x2 = np.random.rand(2,3) + x1 = paddle.to_tensor(x1) + x2 = paddle.to_tensor(x2) + + cos_sim_func = nn.CosineSimilarity(axis=0) + result = cos_sim_func(x1, x2) + print(result.numpy()) + # [0.99806249 0.9817672 0.94987036] + """ + + def __init__(self, axis=1, eps=1e-8): + super(CosineSimilarity, self).__init__() + self._axis = axis + self._eps = eps + + def forward(self, x1, x2): + return F.cosine_similarity(x1, x2, axis=self._axis, eps=self._eps) diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 9fb6c9ebc2e404ab477630aae99a6b43d683b20b..7d0e59fb7575c9d15d28e88a462aed4ddba47fb9 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -15,12 +15,12 @@ # TODO: define classes of convolutional neural network __all__ = [ - 'Conv2D', - 'Conv2DTranspose', - 'Conv3D', - 'Conv3DTranspose', - # 'TreeConv', - # 'Conv1D' + 'Conv1d', + 'Conv2d', + 'Conv3d', + 'ConvTranspose1d', + 'ConvTranspose2d', + 'ConvTranspose3d', ] import numpy as np @@ -38,12 +38,270 @@ def _get_default_param_initializer(num_channels, filter_size): return Normal(0.0, std, 0) -class Conv2D(layers.Layer): +def _reverse_repeat_list(t, n): + """Reverse the order of `t` and repeat each element for `n` times. + This can be used to translate padding arg used by Conv and Pooling modules + to the ones used by `F.pad`. """ - :alias_main: paddle.nn.Conv2D - :alias: paddle.nn.Conv2D,paddle.nn.layer.Conv2D,paddle.nn.layer.conv.Conv2D + return list(x for x in reversed(t) for _ in range(n)) - This interface is used to construct a callable object of the ``Conv2D`` class. + +class _ConvNd(layers.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + transposed, + dims, + stride=1, + padding=0, + padding_mode='zeros', + output_padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + data_format="NCHW"): + super(_ConvNd, self).__init__() + assert weight_attr is not False, "weight_attr should not be False in Conv." 
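+ # NOTE: non-'zeros' padding modes are applied later in forward() via F.pad, so they + # only accept an int padding here; for non-transposed convolutions that padding is + # expanded below into the reversed per-edge list that F.pad expects.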
+ self._param_attr = weight_attr + self._bias_attr = bias_attr + self._groups = groups + self._in_channels = in_channels + self._out_channels = out_channels + self._data_format = data_format + + valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'} + if padding_mode not in valid_padding_modes: + raise ValueError( + "padding_mode must be one of {}, but got padding_mode='{}'". + format(valid_padding_modes, padding_mode)) + + if padding_mode in {'reflect', 'replicate', 'circular' + } and not isinstance(padding, np.int): + raise TypeError( + "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" + ) + + self._stride = utils.convert_to_list(stride, dims, 'stride') + self._dilation = utils.convert_to_list(dilation, dims, 'dilation') + self._kernel_size = utils.convert_to_list(kernel_size, dims, + 'kernel_size') + self._padding = padding + self._padding_mode = padding_mode + self.output_padding = output_padding + + if transposed: + filter_shape = [self._in_channels, out_channels // groups + ] + self._kernel_size + else: + if in_channels % groups != 0: + raise ValueError("in_channels must be divisible by groups.") + + if padding_mode in {'reflect', 'replicate', 'circular'}: + _paired_padding = utils.convert_to_list(padding, 2, 'padding') + self._reversed_padding_repeated_twice = _reverse_repeat_list( + _paired_padding, 2) + + filter_shape = [out_channels, in_channels // groups + ] + self._kernel_size + + self.weight = self.create_parameter( + shape=filter_shape, attr=self._param_attr) + self.bias = self.create_parameter( + attr=self._bias_attr, shape=[self._out_channels], is_bias=True) + + +class Conv1d(layers.Layer): + """ + This interface is used to construct a callable object of the ``Conv1d`` class. + For more details, refer to code examples. + The convolution1D layer calculates the output based on the input, filter + and stride, padding, dilation, groups parameters. Input and + Output are in NCL format or NLC format, where N is batch size, C is the number of + the feature map, L is the length of the feature map. + Filter's shape is [MCK] , where M is the number of output feature map, + C is the number of input feature map, K is the size of the kernel. + If the groups is greater than 1, C will equal the number of input feature map divided by the groups. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. + For each input :math:`X`, the equation is: + .. math:: + Out = \\sigma (W \\ast X + b) + Where: + * :math:`X`: Input value, a ``Tensor`` with 'NCL' format or 'NLC' format. + * :math:`W`: Filter value, a ``Tensor`` with shape [MCK] . + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + Example: + - Input: + Input shape: :math:`(N, C_{in}, L_{in})` + Kernel shape: :math:`(C_{out}, C_{in}, K)` + - Output: + Output shape: :math:`(N, C_{out}, L_{out})` + Where + .. math:: + L_{out}&= \\frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1 + Parameters: + in_channels(int): The number of channels in the input image. + out_channels(int): The number of filter. It is as same as the output + feature map. + kernel_size (int|tuple|list): The filter size. If kernel_size is a tuple, + it must contain one integer, (kernel_size). 
+ stride (int|tuple|list, optional): The stride size. If stride is a tuple, it must + contain one integer, (stride_size). Default: 1. + padding(int|str|tuple|list, optional): The size of zeros to be padded. It must be in one of the following forms. + 1. a string in ['valid', 'same']. + 2. an int, which means the feature map is zero paded by size of `padding` on both sides. + 3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides. + The default value is 0. + dilation (int|tuple|list, optional): The dilation size. If dilation is a tuple, it must + contain one integer, (dilation_size). Default: 1. + groups (int, optional): The groups number of the conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: 1. + padding_mode(str, optional): Four modes: 'zeros', 'reflect', 'replicate', 'circular'. + When in 'zeros' mode, this op uses zeros to pad the input tensor. + When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. + When in 'replicate' mode, uses input boundaries to pad the input tensor. + When in 'circular' mode, uses circular input to pad the input tensor. + Default is 'zeros'. + bias(bool, optional): Whether to use bias. Default: True. + param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + of conv1d. If it is set to None or one attribute of ParamAttr, conv1d + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :math:`Normal(0.0, std)`, + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv1d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv1d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + Attribute: + **weight** (Parameter): the learnable weights of filter of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + Shape: + - x: 3-D tensor with shape: (batch, in_channels, length) or (batch, length, in_channels). + - output: 3-D tensor with same shape as input x. + + Raises: + None + Examples: + .. code-block:: python + import paddle + from paddle.nn import Conv1d + import numpy as np + x = np.array([[[4, 8, 1, 9], + [7, 2, 0, 9], + [6, 9, 2, 6]]]).astype(np.float32) + w=np.array( + [[[9, 3, 4], + [0, 0, 7], + [2, 5, 6]], + [[0, 3, 4], + [2, 9, 7], + [5, 6, 8]]]).astype(np.float32) + paddle.disable_static() + x_t = paddle.to_tensor(x) + conv = Conv1d(3, 2, 3) + conv.weight.set_value(w) + y_t = conv(x_t) + y_np = y_t.numpy() + print(y_np) + # [[[133. 238.] + # [160. 211.]]] + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + bias=True, + weight_attr=None, + bias_attr=None, + data_format="NCL", + name=None): + super(Conv1d, self).__init__() + assert weight_attr is not False, "param_attr should not be False here." 
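+ # Conv1d handles padding itself: a string ('same'/'valid') is passed straight to + # F.conv1d, an int or 1-element list is normalized below, and non-'zeros' padding + # modes are applied in forward() through F.pad before the convolution.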
+ self._in_channels = in_channels + self._out_channels = out_channels + self._groups = groups + if in_channels % groups != 0: + raise ValueError("in_channels must be divisible by groups.") + self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') + self._stride = utils.convert_to_list(stride, 1, 'stride') + self._dilation = utils.convert_to_list(dilation, 1, 'dilation') + self._padding = padding # leave it to F.conv1d + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._data_format = data_format + self._name = name + + self._padding_mode = padding_mode + + valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'} + if padding_mode not in valid_padding_modes: + raise ValueError( + "padding_mode must be one of {}, but got padding_mode='{}'". + format(valid_padding_modes, padding_mode)) + + if padding_mode in {'reflect', 'replicate', 'circular' + } and not isinstance(padding, np.int): + raise ValueError( + "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" + ) + if not isinstance(padding, str): + self._padding = utils.convert_to_list(padding, 1, 'padding') * 2 + + num_filter_channels = in_channels // groups + filter_shape = [self._out_channels, num_filter_channels + ] + self._kernel_size + + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=filter_shape, + default_initializer=_get_default_param_initializer( + self._in_channels, filter_shape)) + self.bias = self.create_parameter( + attr=self._bias_attr, shape=[self._out_channels], + is_bias=True) if bias else None + + def forward(self, x): + padding = 0 + if self._padding_mode != "zeros": + x = F.pad(x, + self._padding, + mode=self._padding_mode, + data_format=self._data_format) + else: + padding = self._padding + + out = F.conv1d( + x, + self.weight, + bias=self.bias, + padding=padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format, + name=self._name) + return out + + +class Conv2d(_ConvNd): + """ + This interface is used to construct a callable object of the ``Conv2d`` class. For more details, refer to code examples. The convolution2D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and @@ -59,48 +317,23 @@ class Conv2D(layers.Layer): If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. - For each input :math:`X`, the equation is: - .. math:: - Out = \\sigma (W \\ast X + b) - Where: - * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 - Parameters: - num_channels(int): The number of channels in the input image. - num_filters(int): The number of filter. 
It is as same as the output - feature map. - filter_size (int or tuple): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. + in_channels(int): The number of channels in the input image. + out_channels(int): The number of channels produced by convolution. + kernel_size (int|list|tuple): The size of convolution kernel. + stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`on both sides @@ -108,10 +341,8 @@ class Conv2D(layers.Layer): 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - stride (int or tuple, optional): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: 1. - dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` . + dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: 1. groups (int, optional): The groups number of the Conv2d Layer. According to grouped @@ -119,129 +350,287 @@ class Conv2D(layers.Layer): the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: 1. - param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) of conv2d. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d. + bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - Default: None. 
data_format (str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - Attribute: **weight** (Parameter): the learnable weights of filter of this layer. - **bias** (Parameter or None): the learnable bias of this layer. - - Returns: - None - - Raises: - ValueError: if ``use_cudnn`` is not a bool value. - + Shape: + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + Where + .. math:: + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel_size[0] - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel_size[1] - 1) + 1))}{strides[1]} + 1 Examples: .. code-block:: python - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - from paddle import nn - + import paddle + import paddle.nn as nn x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - conv = nn.Conv2D(4, 6, (3, 3)) - y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) + + paddle.disable_static() + x_var = paddle.to_tensor(x) + conv = nn.Conv2d(4, 6, (3, 3)) + y_var = conv(x_var) + y_np = y_var.numpy() + print(y_np.shape) # (2, 6, 6, 6) """ def __init__(self, - num_channels, - num_filters, - filter_size, - padding=0, + in_channels, + out_channels, + kernel_size, stride=1, + padding=0, dilation=1, groups=1, - param_attr=None, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCHW"): + super(Conv2d, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 2, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + def forward(self, x): + if self._padding_mode != 'zeros': + x = F.pad(x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format) + return F.conv2d( + x, + self.weight, + bias=self.bias, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + + out = F.conv2d( + x, + self.weight, + bias=self.bias, + padding=self._padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + return out + + +class ConvTranspose1d(layers.Layer): + """ + This interface is used to construct a callable object of the ``ConvTranspose1d`` class. + For more details, refer to code examples. + The 1-D convolution transpose layer calculates the output based on the input, + filter, and dilation, stride, padding. Input(Input) and output(Output) + are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels, + L is the length of the feature. The details of convolution transpose + layer, please refer to the following explanation and references + `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a 3-D Tensor with 'NCL' format or 'NLC' format. + * :math:`W`: Kernel value, a 3-D Tensor with 'MCK' format. + * :math:`\\ast`: Convolution operation. 
+ * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, a 3-D Tensor with data format 'NCL' of 'NLC', the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, L_{in})` + + Filter shape: :math:`(C_{in}, C_{out}, L_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, L_{out})` + + Where + + .. math:: + + L^\prime_{out} &= (L_{in} - 1) * stride - pad_top - pad_bottom + dilation * (L_f - 1) + 1 \\\\ + L_{out} &\in [ L^\prime_{out}, L^\prime_{out} + stride ] + + Note: + The conv1d_transpose can be seen as the backward of the conv1d. For conv1d, + when stride > 1, conv1d maps multiple input shape to the same output shape, + so for conv1d_transpose, when stride > 1, input shape maps multiple output shape. + If output_size is None, :math:`L_{out} = L^\prime_{out}`; + else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}` + and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically. + + Args: + in_channels(int): The number of channels in the input image. + out_channels(int): The number of the filter. It is as same as the output + feature map. + kernel_size(int|tuple|list, optional): The filter size. If kernel_size is a tuple, + it must contain one integers, (kernel_size). None if + use output size to calculate kernel_size. Default: None. kernel_size and + output_size should not be None at the same time. + stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution. + If stride is a tuple, it must contain one integer, (stride_size). + Default: stride = 1. + padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds + `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a + string, either 'VALID' or 'SAME' supported, which is the padding algorithm. + If `padding` is a tuple or list, it could be in two forms: + `[pad]` or `[pad_left, pad_right]`. Default: padding = 0. + output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension. + If it is a tuple, it must contain one integer. Default: 0. + groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups = 1. + bias(bool, optional): Whether to use bias. Default: True. + dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain one integer, (dilation_size). + Default: dilation = 1. + weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv1d_transpose. If it is set to None or one attribute of ParamAttr, conv1d_transpose + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv1d_transpose. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv1d_transpose + will create ParamAttr as bias_attr. 
If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + + Attribute: + **weight** (Parameter): the learnable weights of filters of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + + Shape: + - x(Tensor): 3-D tensor with shape (batch, in_channels, length) when data_format is + "NCL" or shape (batch, length, in_channels) when data_format is "NLC". + - output_size(int|tuple|list, optional): The output image size. If output size is a + tuple, it must contain one integer, (feature_length). None if use + kernel_size, padding, output_padding and stride to calculate output_size. + If output_size and kernel_size are specified at the same time, they + should follow the formula above. Default: None. output_size and kernel_size + should not be None at the same time. + - output(Tensor): 3-D tensor with same shape as input x. + + Examples: + .. code-block:: python + + import paddle + from paddle.nn import ConvTranspose1d + import numpy as np + + paddle.disable_static() + # shape: (1, 2, 4) + x=np.array([[[4, 0, 9, 7], + [8, 0, 9, 2]]]).astype(np.float32) + # shape: (2, 1, 2) + y=np.array([[[7, 0]], + [[4, 2]]]).astype(np.float32) + x_t = paddle.to_tensor(x) + conv = ConvTranspose1d(2, 1, 2) + conv.weight.set_value(y) + y_t = conv(x_t) + y_np = y_t.numpy() + print(y_np) + + # [[[60. 16. 99. 75. 4.]]] + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + weight_attr=None, bias_attr=None, - use_cudnn=True, - act=None, - data_format="NCHW", - dtype='float32'): - super(Conv2D, self).__init__() - assert param_attr is not False, "param_attr should not be False here." - self._num_channels = num_channels - self._num_filters = num_filters + data_format="NCL"): + super(ConvTranspose1d, self).__init__() + assert weight_attr is not False, "param_attr should not be False in ConvTranspose1d."
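+ # ConvTranspose1d builds its parameters directly (it does not derive from _ConvNd): + # the transposed filter shape is [in_channels, out_channels // groups, kernel_size], + # and bias=False skips bias creation entirely so self.bias stays None.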
+ self._param_attr = weight_attr + self._bias_attr = bias_attr self._groups = groups - if num_channels % groups != 0: - raise ValueError("num_channels must be divisible by groups.") - self._act = act + self._in_channels = in_channels + self._out_channels = out_channels + self._output_padding = output_padding self._data_format = data_format - self._dtype = dtype - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - self._use_cudnn = use_cudnn - - self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') - self._stride = utils.convert_to_list(stride, 2, 'stride') - self._dilation = utils.convert_to_list(dilation, 2, 'dilation') - channel_last = (data_format == "NHWC") - self._padding = padding # leave it to F.conv2d - - self._param_attr = param_attr - self._bias_attr = bias_attr + self._bias = bias - num_filter_channels = num_channels // groups - filter_shape = [self._num_filters, num_filter_channels - ] + self._filter_size + self._stride = utils.convert_to_list(stride, 1, 'stride') + self._dilation = utils.convert_to_list(dilation, 1, 'dilation') + self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') + self._padding = padding + filter_shape = [self._in_channels, out_channels // groups + ] + self._kernel_size self.weight = self.create_parameter( - attr=self._param_attr, - shape=filter_shape, - dtype=self._dtype, - default_initializer=_get_default_param_initializer( - self._num_channels, filter_shape)) + shape=filter_shape, attr=self._param_attr) self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True) + attr=self._bias_attr, shape=[self._out_channels], + is_bias=True) if self._bias else None - def forward(self, input): - out = F.conv2d( - input, + def forward(self, x, output_size=None): + out = F.conv_transpose1d( + x, self.weight, bias=self.bias, + output_size=output_size, + output_padding=self._output_padding, padding=self._padding, stride=self._stride, dilation=self._dilation, groups=self._groups, - use_cudnn=self._use_cudnn, - act=self._act, data_format=self._data_format) return out -class Conv2DTranspose(layers.Layer): +class ConvTranspose2d(_ConvNd): """ - :alias_main: paddle.nn.Conv2DTranspose - :alias: paddle.nn.Conv2DTranspose,paddle.nn.layer.Conv2DTranspose,paddle.nn.layer.conv.Conv2DTranspose - - This interface is used to construct a callable object of the ``Conv2DTranspose`` class. + This interface is used to construct a callable object of the ``ConvTranspose2d`` class. For more details, refer to code examples. The convolution2D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input and output @@ -256,55 +645,36 @@ class Conv2DTranspose(layers.Layer): is applied to the final result. The details of convolution transpose layer, please refer to the following explanation and references `conv2dtranspose `_ . - For each input :math:`X`, the equation is: - .. math:: - Out = \sigma (W \\ast X + b) - Where: - * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. 
- Example: - - Input: - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - - Output: - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - Where - .. math:: - H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) - Parameters: - num_channels(int): The number of channels in the input image. - num_filters(int): The number of the filter. It is as same as the output - feature map. - filter_size(int or tuple): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - output_size(int or tuple, optional): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). None if use - filter_size, padding, and stride to calculate output_size. - if output_size and filter_size are specified at the same time, They - should follow the formula above. Default: None. + in_channels(int): The number of channels in the input image. + out_channels(int): The number of channels produced by the convolution. + kernel_size(int|list|uple): The kernel size. If kernel_size is a tuple, + it must contain two integers, (kernel_size_H, kernel_size_W). + Otherwise, the kernel will be a square. + output_padding(int|list|tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides @@ -312,10 +682,10 @@ class Conv2DTranspose(layers.Layer): 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - stride(int or tuple, optional): The stride size. If stride is a tuple, it must + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must contain two integers, (stride_H, stride_W). Otherwise, the stride_H = stride_W = stride. Default: 1. - dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: 1. groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by @@ -324,125 +694,94 @@ class Conv2DTranspose(layers.Layer): first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: 1. 
- param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d_transpose. + bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d_transpose will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - Default: None. data_format (str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - Attribute: **weight** (Parameter): the learnable weights of filters of this layer. - **bias** (Parameter or None): the learnable bias of this layer. - - Returns: - None - + Shape: + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + Where + .. math:: + H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\ + W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\ Examples: .. code-block:: python - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - from paddle import nn - + import paddle + import paddle.nn as nn x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - conv = nn.Conv2DTranspose(4, 6, (3, 3)) - y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) + paddle.disable_static() + x_var = paddle.to_tensor(x) + conv = nn.ConvTranspose2d(4, 6, (3, 3)) + y_var = conv(x_var) + y_np = y_var.numpy() + print(y_np.shape) # (2, 6, 10, 10) """ def __init__(self, - num_channels, - num_filters, - filter_size, - output_size=None, - padding=0, + in_channels, + out_channels, + kernel_size, stride=1, + padding=0, + output_padding=0, dilation=1, groups=1, - param_attr=None, + weight_attr=None, bias_attr=None, - use_cudnn=True, - act=None, - data_format="NCHW", - dtype='float32'): - super(Conv2DTranspose, self).__init__() - assert param_attr is not False, "param_attr should not be False in conv2d_transpose." 
- self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self._groups = groups - self._num_channels = num_channels - self._num_filters = num_filters - self._use_cudnn = use_cudnn - self._data_format = data_format - self._dtype = dtype - - self._stride = utils.convert_to_list(stride, 2, 'stride') - self._dilation = utils.convert_to_list(dilation, 2, 'dilation') - self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + data_format="NCHW"): + super(ConvTranspose2d, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 2, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + def forward(self, x, output_size=None): if output_size is None: - self._output_size = output_size - elif isinstance(output_size, (list, tuple, int)): - self._output_size = utils.convert_to_list(output_size, 2, - 'output_size') + output_padding = self.output_padding else: - raise ValueError( - "output_size should be int, ot list[int] or tuple[int]") - self._padding = padding + output_padding = 0 - filter_shape = [self._num_channels, num_filters // groups - ] + self._filter_size - self.weight = self.create_parameter( - dtype=self._dtype, shape=filter_shape, attr=self._param_attr) - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True) - - def forward(self, input): - out = F.conv2d_transpose( - input, + out = F.conv_transpose2d( + x, self.weight, bias=self.bias, - output_size=self._output_size, padding=self._padding, + output_padding=output_padding, stride=self._stride, dilation=self._dilation, groups=self._groups, - use_cudnn=self._use_cudnn, - act=self._act, + output_size=output_size, data_format=self._data_format) return out -class Conv3D(layers.Layer): +class Conv3d(_ConvNd): """ - :alias_main: paddle.nn.Conv3D - :alias: paddle.nn.Conv3D,paddle.nn.layer.Conv3D,paddle.nn.layer.conv.Conv3D - - **Convlution3D Layer** - - The convolution3D layer calculates the output based on the input, filter + **Convlution3d Layer** + The convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output) are multidimensional tensors with a shape of :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of @@ -451,49 +790,21 @@ class Conv3D(layers.Layer): but adds one dimension(depth). If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. - For each input :math:`X`, the equation is: - .. math:: - Out = \sigma (W \\ast X + b) - In the above equation: - * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` - - - Output: - Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` - - Where - - .. 
math:: - - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 - Parameters: - num_channels(int): The number of channels in the input image. - num_filters(int): The number of filter. It is as same as the output image channel. - filter_size (int|tuple, optional): The filter size. If filter_size is a tuple, - it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). - Otherwise, the filter will be a square, filter_size_depth = filter_size_height - = filter_size_width = filter_size. - stride (int|tuple, optional): The stride size. If stride is a tuple, it must + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size (int|list|tuple, optional): The size of the convolving kernel. + stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must contain three integers, (stride_D, stride_H, stride_W). Otherwise, the stride_D = stride_H = stride_W = stride. The default value is 1. padding (int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. @@ -503,7 +814,7 @@ class Conv3D(layers.Layer): 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must + dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. groups (int, optional): The groups number of the Conv3d Layer. According to grouped @@ -511,7 +822,8 @@ class Conv3D(layers.Layer): the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. - param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv3d. If it is set to None or one attribute of ParamAttr, conv3d will create ParamAttr as param_attr. If it is set to None, the parameter is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is @@ -521,120 +833,97 @@ class Conv3D(layers.Layer): If it is set to None or one attribute of ParamAttr, conv3d will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. The default value is None. - use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. The default value is True. 
- act (str, optional): Activation type, if it is set to None, activation is not appended. - The default value is None. data_format (str, optional): Data format that specifies the layout of input. It can be "NCDHW" or "NDHWC". Default: "NCDHW". - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - Attribute: **weight** (Parameter): the learnable weights of filters of this layer. - **bias** (Parameter): the learnable bias of this layer. - - Returns: - None. - + Shape: + - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + Where + .. math:: + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 Raises: ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. - Examples: .. code-block:: python - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - from paddle import nn - + + import paddle + import paddle.nn as nn x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32') - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - conv = nn.Conv3D(4, 6, (3, 3, 3)) - y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) + + paddle.disable_static() + x_var = paddle.to_tensor(x) + conv = nn.Conv3d(4, 6, (3, 3, 3)) + y_var = conv(x_var) + y_np = y_var.numpy() + print(y_np.shape) # (2, 6, 6, 6, 6) """ def __init__(self, - num_channels, - num_filters, - filter_size, + in_channels, + out_channels, + kernel_size, padding=0, stride=1, dilation=1, groups=1, - param_attr=None, + padding_mode='zeros', + weight_attr=None, bias_attr=None, - use_cudnn=True, - act=None, - data_format="NCDHW", - dtype='float32'): - super(Conv3D, self).__init__() - assert param_attr is not False, "param_attr should not be False here."
- self._num_channels = num_channels - self._num_filters = num_filters - self._groups = groups - self._act = act - self._use_cudnn = use_cudnn - self._dtype = dtype - self._data_format = data_format - - self._stride = utils.convert_to_list(stride, 3, 'stride') - self._dilation = utils.convert_to_list(dilation, 3, 'dilation') - self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') - channel_last = (data_format == "NDHWC") - self._padding = padding - - self._param_attr = param_attr - self._bias_attr = bias_attr - - if num_channels % groups != 0: - raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = num_channels // groups - - filter_shape = [num_filters, num_filter_channels] + self._filter_size - - self.weight = self.create_parameter( - attr=self._param_attr, - shape=filter_shape, - dtype=self._dtype, - default_initializer=_get_default_param_initializer( - self._num_channels, self._filter_size)) - - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True) + data_format="NCDHW"): + super(Conv3d, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 3, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + def forward(self, x): + if self._padding_mode != 'zeros': + x = F.pad(x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format) + return F.conv3d( + x, + self.weight, + bias=self.bias, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) - def forward(self, input): out = F.conv3d( - input, + x, self.weight, bias=self.bias, padding=self._padding, stride=self._stride, dilation=self._dilation, groups=self._groups, - use_cudnn=self._use_cudnn, - act=self._act, data_format=self._data_format) return out -class Conv3DTranspose(layers.Layer): +class ConvTranspose3d(_ConvNd): """ - :alias_main: paddle.nn.Conv3DTranspose - :alias: paddle.nn.Conv3DTranspose,paddle.nn.layer.Conv3DTranspose,paddle.nn.layer.conv.Conv3DTranspose - **Convlution3D transpose layer** - The convolution3D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the number of channels, @@ -646,70 +935,38 @@ class Conv3DTranspose(layers.Layer): If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. - For each input :math:`X`, the equation is: - .. math:: - Out = \sigma (W \\ast X + b) - In the above equation: - * :math:`X`: Input value, a tensor with NCDHW format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` - - Where - - .. 
math:: - - D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ - H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\ - D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\ - **Note**: - - The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, + The conv_transpose3d can be seen as the backward of the conv3d. For conv3d, when stride > 1, conv3d maps multiple input shape to the same output shape, - so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. + so for conv_transpose3d, when stride > 1, input shape maps multiple output shape. If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, - conv3d_transpose can compute the kernel size automatically. - - + conv_transpose3d can compute the kernel size automatically. Parameters: - num_channels(int): The number of channels in the input image. - num_filters(int): The number of the filter. It is as same as the output - image channel. - filter_size(int|tuple): The filter size. If filter_size is a tuple, - it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - output_size(int or tuple, optional): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). None if use - filter_size, padding, and stride to calculate output_size. - if output_size and filter_size are specified at the same time, They - should follow the formula above. Default: None. + in_channels(int): The number of channels in the input image. + out_channels(int): The number of channels produced by the convolution. + kernel_size(int|list|tuple): The kernel size. If kernel_size is a tuple, + it must contain three integers, (kernel_size_D, kernel_size_H, kernel_size_W). + Otherwise, the kernel will be a square. + stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a tuple, it must contain three integers, (stride_depth, stride_height, + stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. + The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` @@ -717,11 +974,9 @@ class Conv3DTranspose(layers.Layer): 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. 
Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a tuple, it must contain three integers, (stride_depth, stride_height, - stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. - The default value is 1. - dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must + output_padding(int|list|tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0. + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by @@ -730,7 +985,7 @@ class Conv3DTranspose(layers.Layer): first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. - param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. The default value is None. @@ -739,109 +994,86 @@ class Conv3DTranspose(layers.Layer): If it is set to None or one attribute of ParamAttr, conv3d_transpose will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. The default value is None. - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. The default value is True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - The default value is None. + output_size(int|list|tuple, optional): The output image size. If output size is a + tuple, it must contain two integers, (image_H, image_W). None if use + filter_size, padding, and stride to calculate output_size. + if output_size and filter_size are specified at the same time, They + should follow the formula above. Default: None. data_format (str, optional): Data format that specifies the layout of input. It can be "NCDHW" or "NDHWC". Default: "NCDHW". - Attribute: **weight** (Parameter): the learnable weights of filters of this layer. - **bias** (Parameter): the learnable bias of this layer. - - Returns: - None. - + Shape: + - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + Where + .. math:: + D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\ + H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\ + W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel_size[2] - 1) + 1 \\\\ Raises: ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. - Examples: .. 
code-block:: python - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - from paddle import nn - + import paddle + import paddle.nn as nn x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32') - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - conv = nn.Conv3DTranspose(4, 6, (3, 3, 3)) - y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) + + paddle.disable_static() + x_var = paddle.to_tensor(x) + conv = nn.ConvTranspose3d(4, 6, (3, 3, 3)) + y_var = conv(x_var) + y_np = y_var.numpy() + print(y_np.shape) # (2, 6, 10, 10, 10) """ def __init__(self, - num_channels, - num_filters, - filter_size, - output_size=None, - padding=0, + in_channels, + out_channels, + kernel_size, stride=1, + padding=0, + output_padding=0, dilation=1, groups=1, - param_attr=None, + weight_attr=None, bias_attr=None, - use_cudnn=True, - act=None, - data_format="NCDHW", - dtype='float32'): - super(Conv3DTranspose, self).__init__() - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - assert param_attr is not False, "param_attr should not be False in conv3d_transpose." - self._num_channels = num_channels - self._num_filters = num_filters - self._groups = groups - self._use_cudnn = use_cudnn - self._act = act - self._dtype = dtype - self._data_format = data_format - - self._stride = utils.convert_to_list(stride, 3, 'stride') - self._dilation = utils.convert_to_list(dilation, 3, 'dilation') - self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') - channel_last = (data_format == "NDHWC") - self._padding = padding + data_format="NCDHW"): + super(ConvTranspose3d, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 3, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + def forward(self, x, output_size=None): if output_size is None: - self._output_size = output_size - elif isinstance(output_size, (list, tuple, int)): - self._output_size = utils.convert_to_list(output_size, 3, - 'output_size') + output_padding = self.output_padding else: - raise ValueError( - "output_size should be int, ot list[int] or tuple[int]") - - self._param_attr = param_attr - self._bias_attr = bias_attr + output_padding = 0 - filter_shape = [num_channels, num_filters // groups] + self._filter_size - self.weight = self.create_parameter( - dtype=self._dtype, shape=filter_shape, attr=self._param_attr) - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True) - - def forward(self, input): - out = F.conv3d_transpose( - input, + out = F.conv_transpose3d( + x, self.weight, bias=self.bias, - output_size=self._output_size, padding=self._padding, + output_padding=output_padding, stride=self._stride, dilation=self._dilation, groups=self._groups, - use_cudnn=self._use_cudnn, - act=self._act, + output_size=output_size, data_format=self._data_format) return out diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 5067264ee792dd642e22a3ecde6eb7c1264d3875..de10e77eb1c000e66a7a914dc94ce39a6268bb61 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -12,25 +12,133 @@ # See the License for the specific language governing permissions and # limitations under the License.
-# TODO: define loss functions of neural network +# TODO: define loss functions of neural network import numpy as np import paddle.fluid as fluid import paddle.fluid.core as core import paddle from .. import functional as F +from paddle.fluid.framework import core, in_dygraph_mode, _varbase_creator __all__ = [ - # 'NCELoss', + 'BCEWithLogitsLoss', 'CrossEntropyLoss', 'MSELoss', 'L1Loss', 'NLLLoss', 'BCELoss', 'KLDivLoss', - 'MarginRankingLoss' + 'MarginRankingLoss', + 'CTCLoss', + 'SmoothL1Loss', ] +class BCEWithLogitsLoss(fluid.dygraph.Layer): + """ + This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. + Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` + layer and some reduce operations. + + This measures the element-wise probability error in classification tasks + in which each class is independent. + This can be thought of as predicting labels for a data-point, where labels + are not mutually exclusive. For example, a news article can be about + politics, technology or sports at the same time or none of these. + + First this operator calculate loss function as follows: + + .. math:: + Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + + We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get: + + .. math:: + Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit}) + + For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0, + we reformulate the loss as follows: + + .. math:: + Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|}) + + Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the + weight tensor on the loss `Out`. The ``weight`` tensor will attach different + weight on every items in the batch. The ``pos_weight`` will attach different + weight on the positive label of each class. + + Finally, this operator applies reduce operation on the loss. + If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. + If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`. + If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`. + + Note that the target labels ``label`` should be numbers between 0 and 1. + + Args: + weight (Tensor, optional): A manual rescaling weight given to the loss of each + batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`, + The data type is float32, float64. Default is ``'None'``. + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default is ``'mean'``. + pos_weight (Tensor, optional): A weight of positive examples. Must be a vector + with length equal to the number of classes. The data type is float32, float64. + Default is ``'None'``. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shapes: + logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *], + N is batch_size, `*` means number of additional dimensions. The ``logit`` + is usually the output of Linear layer. Available dtype is float32, float64. + label (Tensor): The target labels tensor. 
2-D tensor with the same shape as + ``logit``. The target labels which values should be numbers between 0 and 1. + Available dtype is float32, float64. + output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``logit`` , else the shape of output is scalar. + + Returns: + A callable object of BCEWithLogitsLoss. + + Examples: + + .. code-block:: python + import paddle + paddle.disable_static() + logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") + label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") + bce_logit_loss = paddle.nn.BCEWithLogitsLoss() + output = bce_logit_loss(logit, label) + print(output.numpy()) # [0.45618808] + + """ + + def __init__(self, + weight=None, + reduction='mean', + pos_weight=None, + name=None): + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in BCEWithLogitsLoss should be 'sum', 'mean' or 'none', but " + "received %s, which is not allowed." % reduction) + + super(BCEWithLogitsLoss, self).__init__() + self.weight = weight + self.reduction = reduction + self.pos_weight = pos_weight + self.name = name + + def forward(self, logit, label): + out = paddle.nn.functional.binary_cross_entropy_with_logits( + logit, label, self.weight, self.reduction, self.pos_weight, + self.name) + return out + + class CrossEntropyLoss(fluid.dygraph.Layer): """ :alias_main: paddle.nn.CrossEntropyLoss @@ -60,8 +168,8 @@ class CrossEntropyLoss(fluid.dygraph.Layer): Parameters: input (Variable): Input tensor, the data type is float32, float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this - is (N, C, D1, D2,..., Dk), k >= 1. - label (Variable): Label tensor, the data type is int64. Shape is (N), where each + is (N, C, D1, D2,..., Dk), k >= 1. + label (Variable): Label tensor, the data type is int64. Shape is (N), where each value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1. weight (Variable, optional): Weight tensor, a manual rescaling weight given @@ -117,7 +225,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer): print(output.numpy()) """ - def __init__(self, weight=None, reduction='mean', ignore_index=-100): + def __init__(self, weight=None, ignore_index=-100, reduction='mean'): super(CrossEntropyLoss, self).__init__() self.weight = weight self.reduction = reduction @@ -135,25 +243,16 @@ class CrossEntropyLoss(fluid.dygraph.Layer): " 'none', but received %s, which is not allowed." % self.reduction) - log_softmax = paddle.nn.LogSoftmax() - log_softmax_out = log_softmax(input) - if self.weight is not None and not isinstance(self.weight, - fluid.framework.Variable): - raise ValueError( - "The weight' is not a Variable, please convert to Variable.") - nll_loss = paddle.nn.loss.NLLLoss( + return paddle.nn.functional.cross_entropy( + input, + label, weight=self.weight, - reduction=self.reduction, - ignore_index=self.ignore_index) - - return nll_loss(log_softmax_out, label) + ignore_index=self.ignore_index, + reduction=self.reduction) class MSELoss(fluid.dygraph.layers.Layer): """ - :alias_main: paddle.nn.MSELoss - :alias: paddle.nn.MSELoss,paddle.nn.layer.MSELoss,paddle.nn.layer.loss.MSELoss - **Mean Square Error Loss** Computes the mean square error (squared L2 norm) of given input and label. @@ -175,55 +274,34 @@ class MSELoss(fluid.dygraph.layers.Layer): where `input` and `label` are `float32` tensors of same shape. 
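For reference, the three reduction modes described above can be reproduced with plain NumPy (illustrative only; this is not the layer implementation):

.. code-block:: python

    import numpy as np

    input = np.array([1.5], dtype="float32")
    label = np.array([1.7], dtype="float32")

    sq_err = (input - label) ** 2                  # reduction='none' -> [0.04000002]
    print(sq_err, sq_err.mean(), sq_err.sum())     # 'mean' and 'sum' reduce to scalars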
Parameters: - input (Variable): Input tensor, the data type is float32, - label (Variable): Label tensor, the data type is float32, reduction (string, optional): The reduction method for the output, could be 'none' | 'mean' | 'sum'. - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. - If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. - If :attr:`reduction` is ``'none'``, the unreduced loss is returned. + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. + If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - Returns: - The tensor variable storing the MSE loss of input and label. - - Return type: - Variable. + Shape: + input (Tensor): Input tensor, the data type is float32 or float64 + label (Tensor): Label tensor, the data type is float32 or float64 + output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input. Examples: .. code-block:: python import numpy as np import paddle - from paddle import fluid - import paddle.fluid.dygraph as dg - mse_loss = paddle.nn.loss.MSELoss() - input = fluid.data(name="input", shape=[1]) - label = fluid.data(name="label", shape=[1]) - place = fluid.CPUPlace() input_data = np.array([1.5]).astype("float32") label_data = np.array([1.7]).astype("float32") - # declarative mode - output = mse_loss(input,label) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - output_data = exe.run( - fluid.default_main_program(), - feed={"input":input_data, "label":label_data}, - fetch_list=[output], - return_numpy=True) - print(output_data) - # [array([0.04000002], dtype=float32)] - - # imperative mode - with dg.guard(place) as g: - input = dg.to_variable(input_data) - label = dg.to_variable(label_data) - output = mse_loss(input, label) - print(output.numpy()) - # [0.04000002] + paddle.disable_static() + mse_loss = paddle.nn.loss.MSELoss() + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) + output = mse_loss(input, label) + print(output.numpy()) + # [0.04000002] """ def __init__(self, reduction='mean'): @@ -236,10 +314,10 @@ class MSELoss(fluid.dygraph.layers.Layer): def forward(self, input, label): if not fluid.framework.in_dygraph_mode(): - fluid.data_feeder.check_variable_and_dtype(input, 'input', - ['float32'], 'MSELoss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', - ['float32'], 'MSELoss') + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'MSELoss') + fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'MSELoss') square_out = fluid.layers.square( fluid.layers.elementwise_sub(input, label)) @@ -273,23 +351,23 @@ class L1Loss(fluid.dygraph.Layer): .. math:: Out = SUM(\lvert input - label\rvert) - + Parameters: - reduction (str, optional): Indicate the reduction to apply to the loss, + reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If `reduction` is ``'none'``, the unreduced loss is returned; - If `reduction` is ``'mean'``, the reduced mean loss is returned. - If `reduction` is ``'sum'``, the reduced sum loss is returned. + If `reduction` is ``'none'``, the unreduced loss is returned; + If `reduction` is ``'mean'``, the reduced mean loss is returned. + If `reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. 
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Shape: input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64. - output (Tensor): The L1 Loss of ``input`` and ``label``. + output (Tensor): The L1 Loss of ``input`` and ``label``. If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. - + Examples: .. code-block:: python import paddle @@ -303,17 +381,17 @@ class L1Loss(fluid.dygraph.Layer): l1_loss = paddle.nn.loss.L1Loss() output = l1_loss(input, label) - print(output.numpy()) + print(output.numpy()) # [0.35] l1_loss = paddle.nn.loss.L1Loss(reduction='sum') output = l1_loss(input, label) - print(output.numpy()) + print(output.numpy()) # [1.4] l1_loss = paddle.nn.loss.L1Loss(reduction='none') output = l1_loss(input, label) - print(output.numpy()) + print(output.numpy()) # [[0.20000005 0.19999999] # [0.2 0.79999995]] """ @@ -334,90 +412,80 @@ class L1Loss(fluid.dygraph.Layer): class BCELoss(fluid.dygraph.Layer): """ - :alias_main: paddle.nn.BCELoss - :alias: paddle.nn.BCELoss,paddle.nn.layer.BCELoss,paddle.nn.layer.loss.BCELoss - This interface is used to construct a callable object of the ``BCELoss`` class. - The BCELoss layer measures the binary_cross_entropy loss between input predictions - and target labels. The binary_cross_entropy loss can be described as: + The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input`` + and target labels ``label`` . The binary_cross_entropy loss can be described as: If :attr:`weight` is set, the loss is: .. math:: Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input)) + If :attr:`weight` is None, the loss is: .. math:: Out = -1 * (label * log(input) + (1 - label) * log(1 - input)) - If :attr:`reduction` set to ``'none'``, the unreduced loss is: + If :attr:`reduction` set to ``'none'``, the interface will return the original loss `Out`. - .. math:: - Out = Out If :attr:`reduction` set to ``'mean'``, the reduced mean loss is: .. math:: Out = MEAN(Out) + If :attr:`reduction` set to ``'sum'``, the reduced sum loss is: .. math:: Out = SUM(Out) - Note that the input predictions always be the output of sigmoid, and the target labels + Note that the input predictions ``input`` always be the output of sigmoid, and the target labels ``label`` should be numbers between 0 and 1. - The shape of input predictions and target labels are [N, *], where N is batch_size and `*` - means any number of additional dimensions. If ``reduction`` is ``'none'``, the shape of - output is scalar, else the shape of output is same as input. - Parameters: - weight (Variable, optional): A manual rescaling weight given to the loss of each - batch element. If given, has to be a Variable of size nbatch and the data type + weight (Tensor, optional): A manual rescaling weight given to the loss of each + batch element. If given, has to be a Tensor of size nbatch and the data type is float32, float64. Default is ``'None'``. 
- reduction (str, optional): Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. Default is ``'mean'``. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. - Returns: + Shape: + input (Tensor): 2-D tensor with shape: (N, *), N is batch_size, `*` means + number of additional dimensions. The input ``input`` should always + be the output of sigmod. Available dtype is float32, float64. + label (Tensor): 2-D tensor with the same shape as ``input``. The target + labels which values should be numbers between 0 and 1. Available + dtype is float32, float64. + output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is scalar. + + Returns: A callable object of BCELoss. Examples: .. code-block:: python - # declarative mode - import paddle.fluid as fluid import numpy as np import paddle - input = fluid.data(name="input", shape=[3, 1], dtype='float32') - label = fluid.data(name="label", shape=[3, 1], dtype='float32') - bce_loss = paddle.nn.loss.BCELoss() - output = bce_loss(input, label) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - input_data = np.array([0.5, 0.6, 0.7]).astype("float32") label_data = np.array([1.0, 0.0, 1.0]).astype("float32") - output_data = exe.run(fluid.default_main_program(), - feed={"input":input_data, "label":label_data}, - fetch_list=[output], - return_numpy=True) - - print(output_data) # [array([0.65537095], dtype=float32)] - - # imperative mode - import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - label = dg.to_variable(label_data) - output = bce_loss(input, label) - print(output.numpy()) # [0.65537095] + + paddle.disable_static() + input = paddle.to_variable(input_data) + label = paddle.to_variable(label_data) + bce_loss = paddle.nn.loss.BCELoss() + output = bce_loss(input, label) + print(output.numpy()) # [0.65537095] + paddle.enable_static() + """ - def __init__(self, weight=None, reduction='mean'): + def __init__(self, weight=None, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in bce_loss should be 'sum', 'mean' or 'none', but " @@ -426,38 +494,12 @@ class BCELoss(fluid.dygraph.Layer): super(BCELoss, self).__init__() self.weight = weight self.reduction = reduction + self.name = name def forward(self, input, label): - dtype = self._helper.input_dtype(input) - - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'bce_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'bce_loss') - - out = self._helper.create_variable_for_type_inference(dtype=input.dtype) - self._helper.append_op( - type='bce_loss', - inputs={ - 'X': [input], - 'Label': [label], - }, - outputs={'Out': [out]}) - - if self.weight is not None: - if isinstance(self.weight, fluid.framework.Variable): - w = self.weight - out = fluid.layers.elementwise_mul(out, w, axis=-1) - else: - raise ValueError( - "The weight is 
not a Variable, please convert to Variable.") - - if self.reduction == 'sum': - return fluid.layers.reduce_sum(out) - elif self.reduction == 'mean': - return fluid.layers.reduce_mean(out) - else: - return out + out = paddle.nn.functional.binary_cross_entropy( + input, label, self.weight, self.reduction, self.name) + return out class NLLLoss(fluid.dygraph.Layer): @@ -467,18 +509,18 @@ class NLLLoss(fluid.dygraph.Layer): This class accepts input and target label and returns negative log likelihood cross error. It is useful to train a classification problem with C classes. - + The input for the loss is epected to contain log-probabilities of each classes. It has to be a Tensor of size either (batch_size, C) or (batch_size, C, d1, d2, ..., dK) with K >= 1 for the K-dimensional case. The label for the loss should be a class index in the range [0, C-1] where C is the number of classes. If ignore_index is specified, the specified target value does not contribute to the input gradient. - + If the optional argument `weight` is provided, it should be a 1D Tensor assigning weight to each of the classed. This is particularly useful when you have an unbalanced training set. - + The loss is calculated as follows. The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: @@ -501,11 +543,11 @@ class NLLLoss(fluid.dygraph.Layer): Parameters: weight (Tensor, optional): Weight tensor, a manual rescaling weight given to each class. If given, it has to be a 1D Tensor whose size is `[C, ]`. Otherwise, - it treated as if having all ones. the data type is + it treated as if having all ones. the data type is float32, float64, Default is ``'None'``. ignore_index (int64, optional): Specifies a target value that is ignored and does not contribute to the input gradient. - reduction (str, optional): Indicate how to average the loss, + reduction (str, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If `reduction` is ``'mean'``, the reduced mean loss is returned; if `reduction` is ``'sum'``, the reduced sum loss is returned; @@ -586,9 +628,9 @@ class KLDivLoss(fluid.dygraph.Layer): $$l(x, y) = y * (\log(y) - x)$$ Parameters: - reduction (str, optional): Indicate how to average the loss, + reduction (str, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; Default is ``'mean'``. Shape: @@ -603,7 +645,7 @@ class KLDivLoss(fluid.dygraph.Layer): import paddle import numpy as np import paddle.nn as nn - + paddle.enable_imperative() shape = (5, 20) @@ -615,7 +657,7 @@ class KLDivLoss(fluid.dygraph.Layer): pred_loss = kldiv_criterion(paddle.to_variable(x), paddle.to_variable(target)) # shape=[5] - + # 'mean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='mean') pred_loss = kldiv_criterion(paddle.to_variable(x), @@ -648,10 +690,10 @@ class MarginRankingLoss(fluid.dygraph.Layer): """ This interface is used to construct a callable object of the ``MarginRankingLoss`` class. - The MarginRankingLoss layer calculates the margin rank loss between the input, other and label + The MarginRankingLoss layer calculates the margin rank loss between the input, other and label , use the math function as follows. - .. math:: + .. 
math:: margin\_rank\_loss = max(0, -label * (input - other) + margin) If :attr:`reduction` set to ``'mean'``, the reduced mean loss is: @@ -671,7 +713,7 @@ class MarginRankingLoss(fluid.dygraph.Layer): reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Shape: + Shape: input: N-D Tensor, the shape is [N, *], N is batch size and `*` means any number of additional dimensions., available dtype is float32, float64. other: N-D Tensor, `other` have the same shape and dtype as `input`. label: N-D Tensor, label have the same shape and dtype as `input`. @@ -684,23 +726,23 @@ class MarginRankingLoss(fluid.dygraph.Layer): .. code-block:: python - import numpy as np - import paddle - + import numpy as np + import paddle + paddle.disable_static() - + input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype("float32")) other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype("float32")) label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype("float32")) margin_rank_loss = paddle.nn.MarginRankingLoss() - loss = margin_rank_loss(input, other, label) + loss = margin_rank_loss(input, other, label) print(loss.numpy()) # [0.75] """ def __init__(self, margin=0.0, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( - "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but " + "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but " "received %s, which is not allowed." % reduction) super(MarginRankingLoss, self).__init__() self.margin = margin @@ -711,3 +753,167 @@ class MarginRankingLoss(fluid.dygraph.Layer): out = paddle.nn.functional.margin_ranking_loss( input, other, label, self.margin, self.reduction, self.name) return out + + +class CTCLoss(fluid.dygraph.Layer): + """ + :alias_main: paddle.nn.CTCLoss + :alias: paddle.nn.CTCLoss, paddle.nn.layer.CTCLoss, paddle.nn.layer.loss.CTCLoss + + An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) + to compute Connectionist Temporal Classification (CTC) loss. + It can be aliased as softmax with CTC, since a native softmax activation + is interated to the Warp-CTC library to normalize values for each row of the input tensor. + + Parameters: + blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0. + reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. + + Shape: + log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. 
The data type must be float32. + labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32. + input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64. + label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64. + + Returns: + Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``. + + Examples: + + .. code-block:: python + + # declarative mode + import numpy as np + import paddle + + # length of the longest logit sequence + max_seq_length = 4 + #length of the longest label sequence + max_label_length = 3 + # number of logit sequences + batch_size = 2 + # class num + class_num = 3 + + np.random.seed(1) + log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], + [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], + + [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01], + [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]], + + [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02], + [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]], + + [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01], + [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]], + + [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02], + [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32") + labels = np.array([[1, 2, 2], + [1, 2, 2]]).astype("int32") + input_lengths = np.array([5, 5]).astype("int64") + label_lengths = np.array([3, 3]).astype("int64") + + paddle.disable_static() + log_probs = paddle.to_tensor(log_probs) + labels = paddle.to_tensor(labels) + input_lengths = paddle.to_tensor(input_lengths) + label_lengths = paddle.to_tensor(label_lengths) + + loss = paddle.nn.CTCLoss(blank=0, reduction='none')(log_probs, labels, + input_lengths, + label_lengths) + print(loss.numpy()) #[3.9179852 2.9076521] + + loss = paddle.nn.CTCLoss(blank=0, reduction='mean')(log_probs, labels, + input_lengths, + label_lengths) + print(loss.numpy()) #[1.1376063] + """ + + def __init__(self, blank=0, reduction='mean'): + super(CTCLoss, self).__init__() + self.blank = blank + self.reduction = reduction + + def forward(self, log_probs, labels, input_lengths, label_lengths): + return paddle.nn.functional.ctc_loss(log_probs, labels, input_lengths, + label_lengths, self.blank, + self.reduction) + + +class SmoothL1Loss(fluid.dygraph.Layer): + """ + This operator calculates smooth_l1_loss. Creates a criterion that uses a squared + term if the absolute element-wise error falls below 1 and an L1 term otherwise. + In some cases it can prevent exploding gradients and it is more robust and less + sensitivity to outliers. Also known as the Huber loss: + + .. math:: + + loss(x,y)=\\frac{1}{n}\\sum_{i}z_i + + where z_i is given by: + + .. math:: + + \\mathop{z_i}=\\left\\{\\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} + \\end{array} \\right. + + Parameters: + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. 
+ If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned. + Default is ``'mean'``. + delta (float, optional): Specifies the hyperparameter delta to be used. + The value determines how large the errors need to be to use L1. Errors + smaller than delta are minimized with L2. Parameter is ignored for + negative/zero values. Default = 1.0 + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Call Parameters: + input (Tensor): Input tensor, the data type is float32 or float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is float32 or float64. The shape of label + is the same as the shape of input. + + Returns: + The tensor variable storing the smooth_l1_loss of input and label. + + Return type: Tensor. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(3,3).astype("float32") + label_data = np.random.rand(3,3).astype("float32") + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) + loss = paddle.nn.SmoothL1Loss() + output = loss(input, label) + print(output.numpy()) + """ + + def __init__(self, reduction='mean', delta=1.0, name=None): + super(SmoothL1Loss, self).__init__() + self.reduction = reduction + self.delta = delta + self.name = name + + def forward(self, input, label): + return F.smooth_l1_loss( + input, + label, + reduction=self.reduction, + delta=self.delta, + name=self.name) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 1d00f9c7b8b0204affed690bcea2f0ff78a943d1..c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -1,4 +1,17 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
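Before the normalization-layer changes below, a quick NumPy check of the SmoothL1Loss piecewise formula introduced above (the helper `smooth_l1` is only an illustration, not the operator implementation):

.. code-block:: python

    import numpy as np

    def smooth_l1(x, y, delta=1.0):
        # quadratic for residuals below delta, L1-like (minus a constant) otherwise
        diff = np.abs(x - y)
        z = np.where(diff < delta, 0.5 * diff ** 2, delta * diff - 0.5 * delta ** 2)
        return z.mean()   # 'mean' reduction

    print(smooth_l1(np.array([0.2, 3.0]), np.array([0.0, 0.0])))  # (0.02 + 2.5) / 2 = 1.26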
@@ -17,12 +30,1085 @@ from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS -from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS -from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS +#from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS + +#from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS -from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS + +from ...fluid.dygraph import layers + +from ...framework import get_default_dtype, set_default_dtype +from ...fluid.framework import in_dygraph_mode + +from ...fluid.initializer import Constant +from ...fluid.param_attr import ParamAttr +from ...fluid.data_feeder import check_variable_and_dtype, check_type +from ...fluid import core, dygraph_utils + +from ..functional import batch_norm, layer_norm, instance_norm + +import numpy as np +import numbers +import warnings __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'SyncBatchNorm' + 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', + 'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm' ] + + +class _InstanceNormBase(layers.Layer): + """ + This class is based class for InstanceNorm1d, 2d, 3d. + + See InstaceNorm1d, InstanceNorm2d or InstanceNorm3d for more details. + """ + + def __init__(self, + num_features, + epsilon=1e-5, + momentum=0.9, + weight_attr=None, + bias_attr=None, + track_running_stats=False, + data_format="NCHW", + name=None): + super(_InstanceNormBase, self).__init__() + + if weight_attr == False or bias_attr == False: + assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if weight_attr != False and bias_attr != False: + self.scale = self.create_parameter( + attr=self._weight_attr, + shape=[num_features], + default_initializer=Constant(1.0), + is_bias=False) + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=[num_features], + default_initializer=Constant(0.0), + is_bias=True) + else: + self.scale = None + self.bias = None + + def _check_input_dim(self, input): + raise NotImplementedError("InstanceNorm Base error") + + def forward(self, input): + self._check_input_dim(input) + + return instance_norm( + input, weight=self.scale, bias=self.bias, eps=self._epsilon) + + +class InstanceNorm1d(_InstanceNormBase): + """ + Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCL `[batch, in_channels, length]` + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. 
+ epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: 3-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm1d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm2d(_InstanceNormBase): + """ + Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCHW `[batch, in_channels, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. 
+ track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, weight). + - output: 4-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm2d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm3d(_InstanceNormBase): + """ + Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. 
+ weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Shape: + - x: 5-D tensor with shape: (batch, num_features, dims, height, weight). + - output: 5-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm3d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + +class GroupNorm(layers.Layer): + """ + This interface is used to construct a callable object of the ``GroupNorm`` class. + For more details, refer to code examples. + It implements the function of the Group Normalization Layer. + Refer to `Group Normalization `_ . + + Parameters: + num_channels(int): The number of channels of input. + num_groups(int): The number of groups that divided from channels. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the bias is initialized one. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. + name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, weight). + - output: 4-D tensor with same shape as input x. + + Returns: + None + + Examples: + .. 
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') + x = paddle.to_tensor(x_data) + group_norm = paddle.nn.GroupNorm(num_channels=3, num_groups=6) + group_norm_out = group_norm(x) + + print(group_norm_out.numpy) + """ + + def __init__(self, + num_channels, + num_groups, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_layout='NCHW', + name=None): + super(GroupNorm, self).__init__() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._num_channels = num_channels + self._num_groups = num_groups + if data_layout != 'NCHW': + raise ValueError("unsupported data layout:" + data_layout) + + param_shape = [self._num_channels] + + self.weight = self.create_parameter( + attr=self._weight_attr or False, + shape=param_shape, + default_initializer=Constant(1.0)) + + self.bias = self.create_parameter( + attr=self._weight_attr or False, shape=param_shape, is_bias=True) + + def forward(self, input): + inputs = {'X': input} + if self.bias is not None: + inputs['Bias'] = self.bias + if self.weight is not None: + inputs['Scale'] = self.weight + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._num_groups}) + + return self._helper.append_activation(group_norm_out, None) + + +class LayerNorm(layers.Layer): + """ + :alias_main: paddle.nn.LayerNorm + :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm + :old_api: paddle.fluid.dygraph.LayerNorm + + This interface is used to construct a callable object of the ``LayerNorm`` class. + For more details, refer to code examples. + It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. + Refer to `Layer Normalization `_ + + The formula is as follows: + + .. math:: + + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i + + \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon} + + y & = f(\\frac{g}{\\sigma}(x - \\mu) + b) + + - :math:`x`: the vector representation of the summed inputs to the neurons in that layer. + - :math:`H`: the number of hidden units in a layers + - :math:`\\epsilon`: the small value added to the variance to prevent division by zero. + - :math:`g`: the trainable scale parameter. + - :math:`b`: the trainable bias parameter. + + Parameters: + normalized_shape(int|list|tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The + :attr:`param_attr` is initialized as 1 if it is added. Default: None. 
+ bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default: None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Shape: + - x: 2-D, 3-D, 4-D or 5-D tensor. + - output: same shape as input x. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) + layer_norm_out = layer_norm(x) + + print(layer_norm_out.numpy) + """ + + def __init__(self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None): + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = [normalized_shape] + + self._normalized_shape = list(normalized_shape) + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + param_shape = [np.prod(self._normalized_shape)] + + if weight_attr is False: + self.weight = None + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + + if bias_attr is False: + self.bias = None + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + + def forward(self, input): + return layer_norm( + input, + normalized_shape=self._normalized_shape, + weight=self.weight, + bias=self.bias, + epsilon=self._epsilon) + + +class _BatchNormBase(layers.Layer): + """ + BatchNorm base . + """ + + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + track_running_stats=True, + name=None): + super(_BatchNormBase, self).__init__() + self._num_features = num_features + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if get_default_dtype() == 'float16': + set_default_dtype('float32') + + param_shape = [num_features] + + # create parameter + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = (self._weight_attr is False) or ( + self._weight_attr and self._weight_attr.learning_rate == 0.) + + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = (self._bias_attr is False) or ( + self._bias_attr and self._bias_attr.learning_rate == 0.) 
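+        # Moving statistics: created below as non-trainable parameters
+        # (trainable=False, stop_gradient=True) so they are updated by the
+        # batch_norm kernel during training rather than by the optimizer.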
+ + moving_mean_name = None + moving_variance_name = None + + if name is not None: + moving_mean_name = name + "_mean" + moving_variance_name = name + "_variance" + + self._mean = self.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + self._data_format = data_format + self._in_place = False + self._momentum = momentum + self._epsilon = epsilon + self._fuse_with_relu = False + self._track_running_stats = track_running_stats + + def _check_input_dim(self, input): + raise NotImplementedError("BatchNorm Base error") + + def forward(self, input): + + self._check_input_dim(input) + + if not self.training and not self._track_running_stats: + raise ValueError( + 'When inference, expected track_running_stats is True.') + + if self.training and not self._track_running_stats: + warnings.warn( + "When training, we now always track global mean and variance.") + + return batch_norm( + input, + self._mean, + self._variance, + weight=self.weight, + bias=self.bias, + training=self.training, + momentum=self._momentum, + epsilon=self._epsilon, + data_format=self._data_format) + + +class BatchNorm1d(_BatchNormBase): + """ + Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. 
If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL". + track_running_stats(bool, optional): Whether to use global mean and variance. In train period, + True will track global mean and variance used for inference. When inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: 3-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Now track_running_stats is actucal always true. The next version will fix the problem . + + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm1d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm2d(_BatchNormBase): + """ + Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. 
math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + track_running_stats(bool, optional): Whether to use global mean and variance. In train period, + True will track global mean and variance used for inference. When inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, weight). + - output: 4-D tensor with same shape as input x. + + Returns: + None + + **Note**: + Now track_running_stats is actucal always true. The next version will fix the problem . + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm2d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm3d(_BatchNormBase): + """ + Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). 
It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCDHW". Default: NCDHW. + track_running_stats(bool, optional): Whether to use global mean and variance. In train period, + True will track global mean and variance used for inference. When inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. + + Shape: + - x: 5-D tensor with shape: (batch, num_features, dims, height, weight). + - output: 5-D tensor with same shape as input x. + + Returns: + None + + **Note**: + Now track_running_stats is actucal always true. The next version will fix the problem . + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm3d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + +class SyncBatchNorm(layers.Layer): + """ + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected + operations. + The data is normalized by the mean and variance of the channel based on whole mini-batch + , which including data in all gpus. 
+ Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + When model in training mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + - :math:`x` : whole mini-batch data in all gpus + - :math:`m` : the size of the whole mini-batch data + + When model in evaluation mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, + which usually got from the pre-trained model). Global statistics calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The formula of normalization is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\eps` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter vector + - :math:`\\beta` : trainable shift parameter vector + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of this layer. If it is set to None or one attribute of ParamAttr, this layerr + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. If it is set to False, + this layer will not have trainable scale parameter. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. + If it is set to None or one attribute of ParamAttr, this layer + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. If it is set to False, this layer will not + have trainable bias parameter. Default: None. + track_running_stats(bool, optional): Whether to compute global stats, which including running mean and + running variance. Default: True. + + Shapes: + input: Tensor that the dimension from 2 to 5. + output: Tensor with the same shape as input. + + Examples: + .. 
code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+
+          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+          paddle.disable_static()
+          x = paddle.to_tensor(x)
+          if paddle.fluid.is_compiled_with_cuda():
+              sync_batch_norm = nn.SyncBatchNorm(2)
+              hidden1 = sync_batch_norm(x)
+              print(hidden1.numpy())
+              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
+    """
+
+    def __init__(self,
+                 num_features,
+                 epsilon=1e-05,
+                 momentum=0.9,
+                 track_running_stats=True,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 name=None):
+        super(SyncBatchNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._num_features = num_features
+        self._data_layout = data_format
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._track_running_stats = track_running_stats
+
+        if self._track_running_stats is False:
+            warnings.warn(
+                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
+            )
+
+        param_shape = [self._num_features]
+
+        # create the scale (weight) and shift (bias) parameters; each one is
+        # frozen when its attr is False or sets learning_rate to 0.
+        if weight_attr is False:
+            self.weight = self.create_parameter(
+                attr=None, shape=param_shape, default_initializer=Constant(1.0))
+            self.weight.stop_gradient = True
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+            self.weight.stop_gradient = self._weight_attr is not None and self._weight_attr.learning_rate == 0.
+
+        if bias_attr is False:
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                default_initializer=Constant(0.0),
+                is_bias=True)
+            self.bias.stop_gradient = True
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            self.bias.stop_gradient = self._bias_attr is not None and self._bias_attr.learning_rate == 0.
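+        # NOTE: the running statistics created below are buffers rather than
+        # trainable parameters: they use trainable=False / stop_gradient=True,
+        # and are updated by the sync_batch_norm op itself during training
+        # instead of by back-propagation.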
+ + self._mean = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(0.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(1.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + def forward(self, x): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + ### train mode: use mini-batch stats, eval mode: use global stats + ### use_global_stats only support False in sync_batch_norm + if in_dygraph_mode(): + attrs = ("momentum", self._momentum, "epsilon", self._epsilon, + "is_test", not self.training, "data_layout", + self._data_layout, "use_mkldnn", False, "fuse_with_relu", + False, "use_global_stats", False, 'trainable_statistics', + False) + sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( + x, self.weight, self.bias, self._mean, self._variance, mean_out, + variance_out, *attrs) + + return sync_batch_norm_out + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') + + attrs = { + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": not self.training, + "data_layout": self._data_layout, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": False, + "trainable_statistics": False, + } + + inputs = { + "X": [x], + "Scale": [self.weight], + "Bias": [self.bias], + "Mean": [self._mean], + "Variance": [self._variance] + } + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + sync_batch_norm_out = self._helper.create_variable_for_type_inference( + self._dtype) + + outputs = { + "Y": [sync_batch_norm_out], + "MeanOut": [mean_out], + "VarianceOut": [variance_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + self._helper.append_op( + type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return sync_batch_norm_out diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py new file mode 100755 index 0000000000000000000000000000000000000000..87fa0caec9ee287c42d8308d9da25c6d2fc9b911 --- /dev/null +++ b/python/paddle/nn/layer/pooling.py @@ -0,0 +1,877 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from ...fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ...fluid.layers import utils +from ...fluid.dygraph import layers +from ...fluid.layer_helper import LayerHelper +from .. 
import functional as F
+
+__all__ = [
+    'AdaptiveAvgPool2d',
+    'AdaptiveAvgPool3d',
+    'AvgPool1d',
+    'MaxPool1d',
+    'AdaptiveMaxPool1d',
+    'AdaptiveAvgPool1d',
+    'AvgPool2d',
+    'MaxPool2d',
+    'AvgPool3d',
+    'MaxPool3d',
+]
+
+
+class AdaptiveAvgPool2d(layers.Layer):
+    """
+
+    This operation applies 2D adaptive avg pooling on the input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool2d:
+
+    .. math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+
+
+    Parameters:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two elements, (H, W). H and W can be either an int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
+            the order of: [batch_size, input_channels, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name does not need to be set and
+                             is None by default.
+
+    Shape:
+        x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32 or float64.
+        output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is the same as input x.
+
+    Returns:
+        A callable object of AdaptiveAvgPool2d.
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool2d
+            # suppose input data is in the shape of [N, C, H, W], `output_size` is [m, n],
+            # the output shape is [N, C, m, n]; adaptive pool divides the H and W dimensions
+            # of the input data into m * n grids evenly and performs average pooling in each
+            # grid to get the output.
+            # adaptive avg pool performs calculations as follows:
+            #
+            #     for i in range(m):
+            #         for j in range(n):
+            #             hstart = floor(i * H / m)
+            #             hend = ceil((i + 1) * H / m)
+            #             wstart = floor(j * W / n)
+            #             wend = ceil((j + 1) * W / n)
+            #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+            #
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 32, 32]
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3)
+            pool_out = adaptive_avg_pool(x=x)
+            # pool_out.shape is [2, 3, 3, 3]
+    """
+
+    def __init__(self, output_size, data_format="NCHW", name=None):
+        super(AdaptiveAvgPool2d, self).__init__()
+        self._output_size = output_size
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_avg_pool2d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name)
+
+
+class AdaptiveAvgPool3d(layers.Layer):
+    """
+
+    This operation applies 3D adaptive avg pooling on the input tensor. The d, h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool3d:
+
+    .. 
math:: + + dstart &= floor(i * D_{in} / D_{out}) + + dend &= ceil((i + 1) * D_{in} / D_{out}) + + hstart &= floor(j * H_{in} / H_{out}) + + hend &= ceil((j + 1) * H_{in} / H_{out}) + + wstart &= floor(k * W_{in} / W_{out}) + + wend &= ceil((k + 1) * W_{in} / W_{out}) + + Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + + + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means + the size will be the same as that of the input. + data_format (str): The data format of the input and output data. An optional string + from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in + the order of: [batch_size, input_channels, input_depth, input_height, input_width]. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Shape: + x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32 or float64. + output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + + Returns: + A callable object of AdaptiveAvgPool3d. + + Examples: + .. code-block:: python + + # adaptive avg pool3d + # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into l * m * n grids averagely and performs poolings in each + # grid to get output. + # adaptive avg pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 8, 32, 32] + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3) + pool_out = adaptive_avg_pool(x = x) + # pool_out = [2, 3, 3, 3, 3] + """ + + def __init__(self, output_size, data_format="NCDHW", name=None): + super(AdaptiveAvgPool3d, self).__init__() + self._output_size = output_size + self._data_format = data_format + self._name = name + + def forward(self, x): + return F.adaptive_avg_pool3d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name) + + +class AvgPool1d(layers.Layer): + """ + This operation applies a 1D average pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + The output value of the layer with input size (N, C, L), + output (N, C, L_{out}) and kernel_size k can be precisely described as + For average pool1d: + + .. math:: + + Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k]) + + + Args: + kernel_size (int|list|tuple): The pool kernel size. 
If pool kernel size is a tuple or list, + it must contain one integers. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain one integers. + padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero, + then the input is implicitly zero-padded on both sides for padding number of points. + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. + ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. + If it is set to False, the floor function will be used. Default False + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + None. + + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ValueError: If `padding` is a list or tuple but its length greater than 1. + ShapeError: If the input is not a 3-D. + ShapeError: If the output's shape calculated is not greater than 0. + + + Examples: + + .. code-block:: python + import paddle + import paddle.nn as nn + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AvgPool1d = nn.AvgPool1d(kernel_size=2, stride=2, padding=0) + pool_out = AvgPool1d(data) + # pool_out shape: [1, 3, 16] + + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + count_include_pad=True, + ceil_mode=False, + name=None): + super(AvgPool1d, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.name = name + + def forward(self, x): + out = F.avg_pool1d(x, self.kernel_size, self.stride, self.padding, + self.count_include_pad, self.ceil_mode, self.name) + return out + + +class MaxPool1d(layers.Layer): + """ + Applies a 1D max pooling over an input signal composed of several input planes based + on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + + The output value of the layer with input size (N, C, L), + output (N, C, L_{out}) and kernel_size k can be precisely described as + For average pool1d: + + .. math:: + + Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} + + Args: + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one integers. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain one integers. + padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be the following forms: `[pad_left, pad_right]`. + return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. + If it is set to False, the floor function will be used. 
Default False + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + None. + + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ValueError: If `padding` is a list or tuple but its length greater than 1. + ShapeError: If the input is not a 3-D. + ShapeError: If the output's shape calculated is not greater than 0. + + + Examples: + + .. code-block:: python + + import paddle + import paddle.nn as nn + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0) + pool_out = MaxPool1d(data) + # pool_out shape: [1, 3, 16] + + MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True) + pool_out, indices = MaxPool1d(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + return_indices=False, + ceil_mode=False, + name=None): + super(MaxPool1d, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.ceil_mode = ceil_mode + self.return_indices = return_indices + self.name = name + + def forward(self, input): + out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding, + self.return_indices, self.ceil_mode, self.name) + return out + + +class AdaptiveAvgPool1d(layers.Layer): + """ + + This operation applies a 1D adaptive average pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For average adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} + + Args: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + None. + + Raises: + ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + + Examples: + .. code-block:: python + + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) + # + import paddle + import paddle.nn as nn + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16) + pool_out = AdaptiveAvgPool1d(data) + # pool_out shape: [1, 3, 16] + """ + + def __init__(self, output_size, name=None): + super(AdaptiveAvgPool1d, self).__init__() + self.output_size = output_size + self.name = name + + def forward(self, input): + return F.adaptive_avg_pool1d(input, self.output_size, self.name) + + +class AdaptiveMaxPool1d(layers.Layer): + """ + + This operation applies a 1D adaptive max pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For max adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= max(Input[lstart:lend])} + + Args: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_indices (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: + None. + + Raises: + ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + + Examples: + .. code-block:: python + + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn as nn + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16) + pool_out = AdaptiveMaxPool1d(data) + # pool_out shape: [1, 3, 16] + + # for return_indices = true + AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True) + pool_out, indices = AdaptiveMaxPool1d(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + + """ + + def __init__(self, output_size, return_indices=False, name=None): + super(AdaptiveMaxPool1d, self).__init__() + self.output_size = output_size + self.return_indices = return_indices + self.name = name + + def forward(self, input): + return F.adaptive_max_pool1d(input, self.output_size, + self.return_indices, self.name) + + +class AvgPool2d(layers.Layer): + """ + This operation applies 2D average pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCHW format, where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. 
+ + Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Attr: + kernel_size: ksize + + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + $$ + out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + $$ + + Args: + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. Default: kernel_size. + padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be in three forms: `[pad_height, pad_width]` or + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, + `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Otherwise, the pool padding size will be a square of an int. + ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. + divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + + Returns: None. + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. 
code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # avg pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          AvgPool2d = nn.AvgPool2d(kernel_size=2,
+                                stride=2, padding=0)
+          output = AvgPool2d(input)
+          # output.shape [1, 3, 16, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 ceil_mode=False,
+                 count_include_pad=True,
+                 divisor_override=None,
+                 data_format="NCHW",
+                 name=None):
+        super(AvgPool2d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor = divisor_override
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.avg_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            count_include_pad=self.count_include_pad,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name)
+
+
+class MaxPool2d(layers.Layer):
+    """
+    This operation applies 2D max pooling over the input feature based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+        X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+        kernel_size: ksize
+
+      Output:
+        Out shape: $(N, C, H_{out}, W_{out})$
+        $$
+        out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
+                                & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                                \text{stride[1]} \times w + n)
+        $$
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape.
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
+            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name does not need to be set and
+                             is None by default. 
+ + Returns: None + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + # max pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + MaxPool2d = nn.MaxPool2d(kernel_size=2, + stride=2, padding=0) + output = MaxPool2d(input) + # output.shape [1, 3, 16, 16] + + # for return_indices=True + MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True) + output, max_indices = MaxPool2d(input) + # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + return_indices=False, + ceil_mode=False, + data_format="NCHW", + name=None): + super(MaxPool2d, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.return_indices = return_indices + self.ceil_mode = ceil_mode + self.data_format = data_format + self.name = name + + def forward(self, x): + return F.max_pool2d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + return_indices=self.return_indices, + data_format=self.data_format, + name=self.name) + + +class MaxPool3d(layers.Layer): + """ + This operation applies 3D max pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. + + Args: + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (pool_size_Depth, pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, + it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. + Otherwise, the pool stride size will be a cube of an int. Default kernel_size. + padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + ceil_mode (bool): when True, will use ceil instead of floor to compute the output shape. + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is True. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. 
+ name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + + Returns:None. + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + # max pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + MaxPool3d = nn.MaxPool3d(kernel_size=2, + stride=2, padding=0) + output = MaxPool3d(input) + # output.shape [1, 2, 3, 16, 16] + + # for return_indices=True + MaxPool3d = nn.MaxPool3d(kernel_size=2,stride=2, padding=0, return_indices=True) + output, max_indices = MaxPool3d(input) + # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], + """ + + def __init__(self, + kernel_size, + stride, + padding, + return_indices=False, + ceil_mode=False, + data_format="NCDHW", + name=None): + super(MaxPool3d, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.return_indices = return_indices + self.ceil_mode = ceil_mode + self.data_format = data_format + self.name = name + + def forward(self, x): + return F.max_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + return_indices=self.return_indices, + data_format=self.data_format, + name=self.name) + + +class AvgPool3d(layers.Layer): + """ + This operation applies 3D max pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. + + Args: + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (pool_size_Depth, pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, + it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. + Otherwise, the pool stride size will be a cube of an int. + padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + ceil_mode (bool): ${ceil_mode_comment} + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is True. + divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. 
+ The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: None. + Raises: + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + # avg pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + AvgPool3d = nn.AvgPool3d(kernel_size=2, + stride=2, padding=0) + output = AvgPool3d(input) + # output.shape [1, 2, 3, 16, 16] + + """ + + def __init__(self, + kernel_size, + stride, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCDHW", + name=None): + super(AvgPool3d, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.divisor = divisor_override + self.data_format = data_format + self.name = name + + def forward(self, x): + return F.avg_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + count_include_pad=self.count_include_pad, + divisor_override=self.divisor, + data_format=self.data_format, + name=self.name) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 4717609503f7faafc16d8c15e3d404b0d780c3e1..6f1c5f199ac99692840ad3c5cffdb726baf5fa19 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -12,10 +12,1333 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define classes of recurrent neural network +import copy +import collections +import itertools +import six +import math +import sys +import warnings +from functools import partial, reduce + +import paddle +from paddle import framework +from paddle.nn import functional as F +from paddle.nn import initializer as I +from paddle.fluid.dygraph import Layer, LayerList +from paddle.fluid.layers import utils +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.data_feeder import convert_dtype __all__ = [ - # 'RNNCell', - # 'GRUCell', - # 'LSTMCell' + 'RNNCellBase', + 'SimpleRNNCell', + 'LSTMCell', + 'GRUCell', + 'RNN', + 'BiRNN', + 'SimpleRNN', + 'LSTM', + 'GRU', ] + + +def split_states(states, bidirectional=False, state_components=1): + r""" + Split states of RNN network into possibly nested list or tuple of + states of each RNN cells of the RNN network. + + Arguments: + states (Tensor|tuple|list): the concatenated states for RNN network. + When `state_components` is 1, states in a Tensor with shape + `(L*D, N, C)` where `L` is the number of layers of the RNN + network, `D` is the number of directions of the RNN network(1 + for unidirectional RNNs and 2 for bidirectional RNNs), `N` is + the batch size of the input to the RNN network, `C` is the + hidden size of the RNN network. + + When `state_components` is larger than 1, `states` is a tuple of + `state_components` Tensors that meet the requirements described + above. 
+ + For SimpleRNNs and GRUs, `state_components` is 1, and for LSTMs, + `state_components` is 2. + bidirectional (bool): whether the state is of a bidirectional RNN + network. Defaults to False. + state_components (int): the number of the components of the states. see + `states` above. Defaults to 1. + + Returns: + A nested list or tuple of RNN cell states. + If `bidirectional` is True, it can be indexed twice to get an RNN + cell state. The first index indicates the layer, the second index + indicates the direction. + If `bidirectional` is False, it can be indexed once to get an RNN + cell state. The index indicates the layer. + Note that if `state_components` is larger than 1, an RNN cell state + can be indexed one more time to get a tensor of shape(N, C), where + `N` is the batch size of the input to the RNN cell, and `C` is the + hidden size of the RNN cell. + """ + if state_components == 1: + states = paddle.unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + assert len(states) == state_components + states = tuple([paddle.unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + r""" + Concatenate a possibly nested list or tuple of RNN cell states into a + compact form. + + Arguments: + states (list|tuple): a possibly nested list or tuple of RNN cell + states. + If `bidirectional` is True, it can be indexed twice to get an + RNN cell state. The first index indicates the layer, the second + index indicates the direction. + If `bidirectional` is False, it can be indexed once to get an RNN + cell state. The index indicates the layer. + Note that if `state_components` is larger than 1, an RNN cell + state can be indexed one more time to get a tensor of shape(N, C), + where `N` is the batch size of the input to the RNN cell, and + `C` is the hidden size of the RNN cell. + bidirectional (bool): whether the state is of a bidirectional RNN + network. Defaults to False. + state_components (int): the number of the components of the states. see + `states` above. Defaults to 1. + + Returns: + Concatenated states for RNN network. + When `state_components` is 1, states in a Tensor with shape + `(L\*D, N, C)` where `L` is the number of layers of the RNN + network, `D` is the number of directions of the RNN network(1 for + unidirectional RNNs and 2 for bidirectional RNNs), `N` is the batch + size of the input to the RNN network, `C` is the hidden size of the + RNN network. + + """ + if state_components == 1: + return paddle.stack(flatten(states)) + else: + states = flatten(states) + componnets = [] + for i in range(state_components): + componnets.append(states[i::state_components]) + return [paddle.stack(item) for item in componnets] + + +class RNNCellBase(Layer): + r""" + RNNCellBase is the base class for abstraction representing the calculations + mapping the input and state to the output and new state. It is suitable to + and mostly used in RNN. + """ + + def get_initial_states(self, + batch_ref, + shape=None, + dtype=None, + init_value=0., + batch_dim_idx=0): + r""" + Generate initialized states according to provided shape, data type and + value. + Arguments: + batch_ref (Tensor): A tensor, which shape would be used to + determine the batch size, which is used to generate initial + states. 
For `batch_ref`'s shape d, `d[batch_dim_idx]` is + treated as batch size. + shape (list|tuple, optional): A (possibly nested structure of) shape[s], + where a shape is a list/tuple of integer). `-1` (for batch size) + will be automatically prepended if a shape does not starts with + it. If None, property `state_shape` will be used. Defaults to + None. + dtype (str|list|tuple, optional): A (possibly nested structure of) + data type[s]. The structure must be same as that of `shape`, + except when all tensors' in states has the same data type, a + single data type can be used. If None and property `cell.state_shape` + is not available, current default floating type of paddle is + used. Defaults to None. + init_value (float, optional): A float value used to initialize states. + Defaults to 0. + batch_dim_idx (int, optional): An integer indicating which + dimension of the of `batch_ref` represents batch. Defaults to 0. + Returns: + init_states (Tensor|tuple|list): tensor of the provided shape and + dtype, or list of tensors that each satisfies the requirements, + packed in the same structure as `shape` and `type` does. + """ + # TODO: use inputs and batch_size + batch_ref = flatten(batch_ref)[0] + + def _is_shape_sequence(seq): + if sys.version_info < (3, ): + integer_types = ( + int, + long, ) + else: + integer_types = (int, ) + """For shape, list/tuple of integer is the finest-grained objection""" + if (isinstance(seq, list) or isinstance(seq, tuple)): + if reduce(lambda flag, x: isinstance(x, integer_types) and flag, + seq, True): + return False + # TODO: Add check for the illegal + if isinstance(seq, dict): + return True + return (isinstance(seq, collections.Sequence) and + not isinstance(seq, six.string_types)) + + class Shape(object): + def __init__(self, shape): + self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) + + # nested structure of shapes + states_shapes = self.state_shape if shape is None else shape + is_sequence_ori = utils.is_sequence + utils.is_sequence = _is_shape_sequence + states_shapes = map_structure(lambda shape: Shape(shape), states_shapes) + utils.is_sequence = is_sequence_ori + + # nested structure of dtypes + try: + states_dtypes = self.state_dtype if dtype is None else dtype + except NotImplementedError: + states_dtypes = framework.get_default_dtype() + if len(flatten(states_dtypes)) == 1: + dtype = flatten(states_dtypes)[0] + states_dtypes = map_structure(lambda shape: dtype, states_shapes) + + init_states = map_structure( + lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like( + input=batch_ref, + shape=shape.shape, + dtype=dtype, + value=init_value, + input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) + return init_states + + @property + def state_shape(self): + r""" + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) shape[s], where a shape is a + list/tuple of integers (-1 for batch size would be automatically + inserted into a shape if shape is not started with it). + Not necessary to be implemented if states are not initialized by + `get_initial_states` or the `shape` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_shape` in the used cell.") + + @property + def state_dtype(self): + r""" + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) data types[s]. 
The structure must be + same as that of `shape`, except when all tensors' in states has the same + data type, a signle data type can be used. + Not necessary to be implemented if states are not initialized + by `get_initial_states` or the `dtype` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_dtype` in the used cell.") + + +class SimpleRNNCell(RNNCellBase): + r""" + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + computes the outputs and updates states. + + The formula used is as follows: + + .. math:: + h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Please refer to `Finding Structure in Time + `_ for more details. + + Arguments: + input_size (int): The input size. + hidden_size (int): The hidden size. + activation (str, optional): The activation in the SimpleRNN cell. + It can be `tanh` or `relu`. Defaults to `tanh`. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + `weight_ih`. Default: None. + weight_hh_attr(ParamAttr, optional): The parameter attribute for + `weight_hh`. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_ih`. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_hh`. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Parameters: + weight_ih (Parameter): shape (hidden_size, input_size), input to hidden + weight, corresponding to :math:`W_{ih}` in the formula. + weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to + hidden weight, corresponding to :math:`W_{hh}` in the formula. + bias_ih (Parameter): shape (hidden_size, ), input to hidden bias, + corresponding to :math:`b_{ih}` in the formula. + bias_hh (Parameter): shape (hidden_size, ), hidden to hidden bias, + corresponding to :math:`b_{hh}` in the formula. + + Inputs: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (Tensor, optional): shape `[batch_size, hidden_size]`, the + previous hidden state, corresponding to :math:`h_{t-1}` in the + formula. When states is None, zero state is used. Defaults to + None. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (Tensor): shape `[batch_size, hidden_size]`, the new hidden + state, corresponding to :math:`h_{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + + Examples: + + .. 
code-block:: python + + import paddle + paddle.disable_static() + + x = paddle.randn((4, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.SimpleRNNCell(16, 32) + y, h = cell(x, prev_h) + + """ + + def __init__(self, + input_size, + hidden_size, + activation="tanh", + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(SimpleRNNCell, self).__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = self.create_parameter( + (hidden_size, input_size), + weight_ih_attr, + default_initializer=I.Uniform(-std, std)) + self.weight_hh = self.create_parameter( + (hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = self.create_parameter( + (hidden_size, ), + bias_ih_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + self.bias_hh = self.create_parameter( + (hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.input_size = input_size + self.hidden_size = hidden_size + if activation not in ["tanh", "relu"]: + raise ValueError( + "activation for SimpleRNNCell should be tanh or relu, " + "but get {}".format(activation)) + self.activation = activation + self._activation_fn = paddle.tanh \ + if activation == "tanh" \ + else F.relu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_h = states + i2h = paddle.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self._activation_fn(i2h + h2h) + return h, h + + @property + def state_shape(self): + return (self.hidden_size, ) + + +class LSTMCell(RNNCellBase): + r""" + Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. + + The formula used is as follows: + + .. math:: + i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) + f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) + o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) + \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) + c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t} + h_{t} & = o_{t} \* \\tanh(c_{t}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Arguments: + input_size (int): The input size. + hidden_size (int): The hidden size. + weight_ih_attr(ParamAttr, optional): The parameter attribute for + `weight_ih`. Default: None. + weight_hh_attr(ParamAttr, optional): The parameter attribute for + `weight_hh`. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_ih`. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_hh`. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Parameters: + weight_ih (Parameter): shape (4 * hidden_size, input_size), input to + hidden weight, which corresponds to the concatenation of + :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula. 
+ weight_hh (Parameter): shape (4 * hidden_size, hidden_size), hidden to + hidden weight, which corresponds to the concatenation of + :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula. + bias_ih (Parameter): shape (4 * hidden_size, ), input to hidden bias, + which corresponds to the concatenation of + :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula. + bias_hh (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, + which corresponds to the concatenation of + :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula. + + Inputs: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (tuple, optional): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the previous hidden state, + corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. + When states is None, zero state is used. Defaults to None. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (tuple): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the new hidden states, + corresponding to :math:`h_{t}, c{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + + Examples: + + .. code-block:: python + + import paddle + paddle.disable_static() + + x = paddle.randn((4, 16)) + prev_h = paddle.randn((4, 32)) + prev_c = paddle.randn((4, 32)) + + cell = paddle.nn.LSTMCell(16, 32) + y, (h, c) = cell(x, (prev_h, prev_c)) + + """ + + def __init__(self, + input_size, + hidden_size, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(LSTMCell, self).__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = self.create_parameter( + (4 * hidden_size, input_size), + weight_ih_attr, + default_initializer=I.Uniform(-std, std)) + self.weight_hh = self.create_parameter( + (4 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = self.create_parameter( + (4 * hidden_size, ), + bias_ih_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + self.bias_hh = self.create_parameter( + (4 * hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = F.sigmoid + self._activation = paddle.tanh + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_hidden, pre_cell = states + gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih is not None: + gates = gates + self.bias_ih + gates += paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + gates = gates + self.bias_hh + + chunked_gates = paddle.split(gates, num_or_sections=4, axis=-1) + + i = self._gate_activation(chunked_gates[0]) + f = self._gate_activation(chunked_gates[1]) + o = self._gate_activation(chunked_gates[3]) + c = f * pre_cell + i * self._activation(chunked_gates[2]) + h = o * self._activation(c) + + return h, (h, c) + + @property + def state_shape(self): + r""" + The `state_shape` of LSTMCell is a tuple with two shapes: + `((hidden_size, ), (hidden_size,))`. 
(-1 for batch size would be
+        automatically inserted into the shape). These two shapes correspond
+        to :math:`h_{t-1}` and :math:`c_{t-1}` respectively.
+        """
+        return ((self.hidden_size, ), (self.hidden_size, ))
+
+
+class GRUCell(RNNCellBase):
+    r"""
+    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
+    it computes the outputs and updates states.
+
+    The formula used is as follows:
+
+    .. math::
+
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
+        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
+    multiplication operator.
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    `_ for more details.
+
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for
+            `weight_ih`. Default: None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for
+            `weight_hh`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih`. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh`. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Parameters:
+        weight_ih (Parameter): shape (3 * hidden_size, input_size), input to
+            hidden weight, which corresponds to the concatenation of
+            :math:`W_{ir}, W_{iz}, W_{ic}` in the formula.
+        weight_hh (Parameter): shape (3 * hidden_size, hidden_size), hidden to
+            hidden weight, which corresponds to the concatenation of
+            :math:`W_{hr}, W_{hz}, W_{hc}` in the formula.
+        bias_ih (Parameter): shape (3 * hidden_size, ), input to hidden bias,
+            which corresponds to the concatenation of
+            :math:`b_{ir}, b_{iz}, b_{ic}` in the formula.
+        bias_hh (Parameter): shape (3 * hidden_size, ), hidden to hidden bias,
+            which corresponds to the concatenation of
+            :math:`b_{hr}, b_{hz}, b_{hc}` in the formula.
+
+    Inputs:
+        inputs (Tensor): A tensor with shape `[batch_size, input_size]`,
+            corresponding to :math:`x_t` in the formula.
+        states (Tensor, optional): A tensor with shape `[batch_size, hidden_size]`,
+            corresponding to :math:`h_{t-1}` in the formula. When states is
+            None, zero state is used. Defaults to None.
+
+    Returns:
+        (outputs, new_states)
+        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
+            corresponding to :math:`h_{t}` in the formula.
+        states (Tensor): shape `[batch_size, hidden_size]`, the new hidden
+            state, corresponding to :math:`h_{t}` in the formula.
+
+    Notes:
+        All the weights and biases are initialized with `Uniform(-std, std)`
+        by default, where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more
+        information about parameter initialization, please refer to
+        :ref:`api_fluid_ParamAttr`.
+
+    Examples:
+
+        ..
code-block:: python + + import paddle + paddle.disable_static() + + x = paddle.randn((4, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.GRUCell(16, 32) + y, h = cell(x, prev_h) + + """ + + def __init__(self, + input_size, + hidden_size, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(GRUCell, self).__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = self.create_parameter( + (3 * hidden_size, input_size), + weight_ih_attr, + default_initializer=I.Uniform(-std, std)) + self.weight_hh = self.create_parameter( + (3 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = self.create_parameter( + (3 * hidden_size, ), + bias_ih_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + self.bias_hh = self.create_parameter( + (3 * hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = F.sigmoid + self._activation = paddle.tanh + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + + pre_hidden = states + x_gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) + h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) + + r = self._gate_activation(x_r + h_r) + z = self._gate_activation(x_z + h_z) + c = self._activation(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + + return h, h + + @property + def state_shape(self): + r""" + The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to the shape of :math:`h_{t-1}`. + """ + return (self.hidden_size, ) + + +class RNN(Layer): + r""" + Wrapper for RNN, which creates a recurrent neural network with an RNN cell. + It performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Arguments: + cell(RNNCellBase): An instance of `RNNCellBase`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Defaults to False. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. + + Inputs: + inputs (Tensor): A (possibly nested structure of) tensor[s]. The input + sequences. + If time major is True, the shape is `[batch_size, time_steps, input_size]` + If time major is False, the shape is [time_steps, batch_size, input_size]` + where `input_size` is the input size of the cell. + initial_states (Tensor|list|tuple, optional): Tensor of a possibly + nested structure of tensors, representing the initial state for + the rnn cell. If not provided, `cell.get_initial_states` would be + called to produce the initial states. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. 
+ **kwargs: Additional keyword arguments to pass to `forward` of the cell. + + Returns: + (outputs, final_states) + outputs (Tensor|list|tuple): the output sequences. + If `time_major` is True, the shape is + `[time_steps, batch_size, hidden_size]`, else + `[batch_size, time_steps, hidden_size]`. + final_states (Tensor|list|tuple): final states of the cell. Tensor or + a possibly nested structure of tensors which has the same structure + with intial state. Each tensor in final states has the same shape + and dtype as the corresponding tensor in initial states. + + Notes: + This class is a low level API for wrapping rnn cell into a RNN network. + Users should take care of the state of the cell. If `initial_states` is + passed to the `forward` method, make sure that it satisfies the + requirements of the cell. + + Examples: + + .. code-block:: python + + import paddle + paddle.disable_static() + + inputs = paddle.rand((4, 23, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.SimpleRNNCell(16, 32) + rnn = paddle.nn.RNN(cell) + outputs, final_states = rnn(inputs, prev_h) + + """ + + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, + dtype=inputs.dtype, + batch_dim_idx=self.batch_index) + + final_outputs, final_states = F.rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse, + **kwargs) + return final_outputs, final_states + + +class BiRNN(Layer): + r""" + Wrapper for bidirectional RNN, which builds a bidiretional RNN given the + forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and + backward RNN with coresponding cells separately and concats the outputs + along the last axis. + + Arguments: + cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN. + cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. + + Inputs: + inputs (Tensor): the input sequences of both RNN. + If time_major is True, the shape of is + `[time_steps, batch_size, input_size]`, else the shape is + `[batch_size, time_steps, input_size]`, where input_size is the + input size of both cells. + initial_states (list|tuple, optional): A tuple/list of the initial + states of the forward cell and backward cell. Defaults to None. + If not provided, `cell.get_initial_states` would be called to + produce the initial states for each cell. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. + **kwargs: Additional keyword arguments. Arguments passed to `forward` + for each cell. + + Outputs: + (outputs, final_states) + outputs (Tensor): the outputs of the bidirectional RNN. 
It is the + concatenation of the outputs from the forward RNN and backward + RNN along the last axis. + If time major is True, the shape is `[time_steps, batch_size, size]`, + else the shape is `[batch_size, time_steps, size]`, where size is + `cell_fw.hidden_size + cell_bw.hidden_size`. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. + + Notes: + This class is a low level API for wrapping rnn cells into a BiRNN + network. Users should take care of the states of the cells. + If `initial_states` is passed to the `forward` method, make sure that + it satisfies the requirements of the cells. + + Examples: + + .. code-block:: python + + import paddle + paddle.disable_static() + + cell_fw = paddle.nn.LSTMCell(16, 32) + cell_bw = paddle.nn.LSTMCell(16, 32) + rnn = paddle.nn.BiRNN(cell_fw, cell_bw) + + inputs = paddle.rand((2, 23, 16)) + outputs, final_states = rnn(inputs) + + """ + + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + if cell_fw.input_size != cell_bw.input_size: + raise ValueError("input size of forward cell({}) does not equals" + "that of backward cell({})".format( + cell_fw.input_size, cell_bw.input_size)) + for cell in [self.cell_fw, self.cell_bw]: + if not hasattr(cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + cell.call = cell.forward + self.time_major = time_major + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + + outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major, **kwargs) + return outputs, final_states + + +class RNNMixin(LayerList): + r""" + A Mixin class for RNN networks. It provides `forward` method for SimpleRNN, + LSTM and GRU. + """ + + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + dtype = inputs.dtype + if initial_states is None: + state_shape = (self.num_layers * self.num_directions, -1, + self.hidden_size) + if self.state_components == 1: + initial_states = paddle.fluid.layers.fill_constant_batch_size_like( + inputs, state_shape, dtype, 0, batch_index, 1) + else: + initial_states = tuple([ + paddle.fluid.layers.fill_constant_batch_size_like( + inputs, state_shape, dtype, 0, batch_index, 1) + for _ in range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) + final_states = [] + + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = F.dropout( + inputs, + self.dropout, + training=self.training, + mode="upscale_in_train") + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) + return outputs, final_states + + +class SimpleRNN(RNNMixin): + r""" + Multilayer Elman network(SimpleRNN). It takes input sequences and initial + states as inputs, and returns the output sequences and the final states. 
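+
+    As a quick shape reference, a stacked bidirectional network can be used as
+    follows (a minimal sketch; the Examples section below shows the basic case):
+
+    .. code-block:: python
+
+        import paddle
+        paddle.disable_static()
+
+        rnn = paddle.nn.SimpleRNN(16, 32, num_layers=2, direction="bidirectional")
+        x = paddle.randn((4, 23, 16))  # [batch_size, time_steps, input_size]
+        y, h = rnn(x)
+        # y: [4, 23, 64] -> [batch_size, time_steps, num_directions * hidden_size]
+        # h: [4, 4, 32]  -> [num_layers * num_directions, batch_size, hidden_size]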
+ + Each layer inside the SimpleRNN maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) + and new states(:math:`h_{t}`). + + .. math:: + + h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int, optional): Number of layers. Defaults to 1. + activation (str, optional): The activation in each SimpleRNN cell. It can be + `tanh` or `relu`. Defaults to `tanh`. + direction (str, optional): The direction of the network. It can be "forward", + "backward" and "bidirectional". Defaults to "forward". + dropout (float, optional): The droput probability. Dropout is applied to the + input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input means the + time steps. Defaults to False. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + `weight_ih` of each cell. Defaults to None. + weight_hh_attr (ParamAttr, optional): The parameter attribute for + `weight_hh` of each cell. Defaults to None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_ih` of each cells. Defaults to None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_hh` of each cells. Defaults to None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (Tensor, optional): the initial state. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. + + Returns: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + else, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. + final_states (Tensor): final states. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. + + Examples: + + .. 
code-block:: python + + import paddle + paddle.disable_static() + + rnn = paddle.nn.SimpleRNN(16, 32, 2) + + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + y, h = rnn(x, prev_h) + + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + activation="tanh", + direction="forward", + dropout=0., + time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(SimpleRNN, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = SimpleRNNCell(input_size, hidden_size, activation, + weight_ih_attr, weight_hh_attr, bias_ih_attr, + bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = SimpleRNNCell(hidden_size, hidden_size, activation, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = SimpleRNNCell(input_size, hidden_size, activation, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) + cell_bw = SimpleRNNCell(input_size, hidden_size, activation, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = SimpleRNNCell( + 2 * hidden_size, hidden_size, activation, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = SimpleRNNCell( + 2 * hidden_size, hidden_size, activation, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 + + +class LSTM(RNNMixin): + r""" + Multilayer LSTM. It takes a sequence and an initial state as inputs, and + returns the output sequences and the final states. + + Each layer inside the LSTM maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step + outputs(:math:`y_{t}`) and new states(:math:`h_{t}, c_{t}`). + + .. math:: + + i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) + f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) + o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) + \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) + c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t} + h_{t} & = o_{t} \* \\tanh(c_{t}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int, optional): Number of layers. Defaults to 1. + direction (str, optional): The direction of the network. It can be + "forward", "backward" and "bidirectional". Defaults to "forward". + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. 
+        time_major (bool, optional): Whether the first dimension of the input
+            means the time steps. Defaults to False.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for
+            `weight_ih` of each cell. Default: None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for
+            `weight_hh` of each cell. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih` of each cell. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh` of each cell. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Inputs:
+        inputs (Tensor): the input sequence.
+            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states (tuple, optional): the initial state, a tuple of (h, c),
+            the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`.
+            If initial_states is not given, zero initial states are used.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time step
+            index are not less than the valid length are treated as paddings.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the output sequence.
+            If `time_major` is True, the shape is
+            `[time_steps, batch_size, num_directions * hidden_size]`,
+            else the shape is
+            `[batch_size, time_steps, num_directions * hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+        final_states (tuple): the final states, a tuple of two tensors, h and c.
+            The shape of each is
+            `[num_layers * num_directions, batch_size, hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+
+    Examples:
+
+        ..
code-block:: python + + import paddle + paddle.disable_static() + + rnn = paddle.nn.LSTM(16, 32, 2) + + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + prev_c = paddle.randn((2, 4, 32)) + y, (h, c) = rnn(x, (prev_h, prev_c)) + + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(LSTM, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 2 + + +class GRU(RNNMixin): + r""" + Multilayer GRU. It takes input sequencse and initial states as inputs, and + returns the output sequences and the final states. + + Each layer inside the GRU maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) + and new states(:math:`h_{t}`). + + .. math:: + + r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr}) + z_{t} & = \sigma(W_{iz)x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz}) + \\widetilde{h}_{t} & = \\tanh(W_{ic)x_{t} + b_{ic} + r_{t} \* (W_{hc}x_{t} + b{hc})) + h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t} + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int, optional): Number of layers. Defaults to 1. + direction (str, optional): The direction of the network. It can be + "forward", "backward" and "bidirectional". Defaults to "forward". + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input + means the time steps. Defaults to False. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + `weight_ih` of each cell. Default: None. 
+ weight_hh_attr (ParamAttr, optional): The parameter attribute for + `weight_hh` of each cell. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_ih` of each cells. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + `bias_hh` of each cells. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (Tensor, optional): the initial state. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + + Returns: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + else, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. + final_states (Tensor): final states. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. + + Examples: + + .. code-block:: python + + import paddle + paddle.disable_static() + + rnn = paddle.nn.GRU(16, 32, 2) + + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + y, h = rnn(x, prev_h) + + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(GRU, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + 
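+        # Bookkeeping consumed by RNNMixin.forward: `dropout` is applied to the
+        # input of every layer except the first, and `state_components` is 1
+        # because a GRU cell carries a single hidden-state tensor (an LSTM uses
+        # 2, one for h and one for c).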
self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 2b926b5ab369046fc07c3d3d8cd56431d7f740a7..50a8755ac9f7b0a8e35c60f02a9fb825195ab80f 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -13,4 +13,1107 @@ # limitations under the License. # TODO: define the classes of Transformer neural network -# __all__ = [ ] +__all__ = [ + 'MultiHeadAttention', + 'TransformerEncoderLayer', + 'TransformerEncoder', + 'TransformerDecoderLayer', + 'TransformerDecoder', + 'Transformer', +] + +import copy +import collections + +from ...fluid import layers +from ...fluid.param_attr import ParamAttr +from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList +from .. import functional as F +from ...fluid.layers import utils +from ...fluid.layers.utils import map_structure + + +def _convert_param_attr_to_list(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % n) + param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + +class MultiHeadAttention(Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in value. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + weight_attr(ParamAttr, optional): To specify the weight parameter property. + Default: None, which means the default weight parameter property is used. + See usage for details in :code:`ParamAttr` . + bias_attr (ParamAttr, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + If it is set to False, this layer will not have trainable bias parameter. + See usage for details in :code:`ParamAttr` . + + Examples: + + .. 
code-block:: python + + import paddle + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.MultiHeadAttention(128, 2) + output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] + """ + + Cache = collections.namedtuple("Cache", ["k", "v"]) + StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=False, + weight_attr=None, + bias_attr=None): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.q_proj = Linear( + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) + self.k_proj = Linear( + self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) + self.v_proj = Linear( + self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) + self.out_proj = Linear( + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) + + def _prepare_qkv(self, query, key, value, cache=None): + """ + Prapares linear projected queries, keys and values for usage of subsequnt + multiple parallel attention. If `cache` is not None, using cached results + to reduce redundant calculations. + + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. + value (Tensor): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + It is a namedtuple with `k` and `v` as fields, and stores tensors + shaped `[batch_size, num_heads, length, embed_dim]` which are results + of linear projection, reshape and transpose calculations in + MultiHeadAttention. If is an instance of `Cache`, `k` and `v` + fields reserve intermediate results of previous positions, which + mostly used for decoder self attention. If it is an instance of + `StaticCache`, `key` and `value` args would be ignored, `k` and + `v` fields would be used as calculated results on `key` and + `value`, which mostly used for decoder-encoder cross attention. + It is only used for inference and should be None for training. + Default None. + + Returns: + tuple: A tuple including linear projected keys and values. These two \ + tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ + and `[batch_size, n_head, sequence_length, d_value]` separately, \ + and their data types are same as inputs. 
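+
+            Note that `q` is shaped `[batch_size, num_heads, query_length, head_dim]`
+            and `k`, `v` are shaped `[batch_size, num_heads, key_length, head_dim]`,
+            where `head_dim = embed_dim // num_heads`, so the attention scores
+            computed from them in `forward` have shape
+            `[batch_size, num_heads, query_length, key_length]`.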
+ """ + q = self.q_proj(query) + q = layers.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + + if isinstance(cache, self.StaticCache): + # for encoder-decoder attention in inference and has cached + k, v = cache.k, cache.v + else: + k, v = self.compute_kv(key, value) + + if isinstance(cache, self.Cache): + # for decoder self-attention in inference + k = layers.concat([cache.k, k], axis=2) + v = layers.concat([cache.v, v], axis=2) + cache = self.Cache(k, v) + + return (q, k, v) if cache is None else (q, k, v, cache) + + def compute_kv(self, key, value): + """ + Applies linear projection on input keys and values, then splits heads + (reshape and transpose) to get keys and values from different representation + subspaces. The results are used as key-values pairs for subsequent multiple + parallel attention. + + It is part of calculations in multi-head attention, and is provided as + a method to pre-compute and prefetch these results, thus we can use them + to construct cache for inference. + + Parameters: + key (Tensor): The keys for multi-head attention. It is a tensor + with shape `[batch_size, sequence_length, kdim]`. The data type + should be float32 or float64. + value (Tensor): The values for multi-head attention. It is a tensor + with shape `[batch_size, sequence_length, vdim]`. The data type + should be float32 or float64. + + Returns: + tuple: A tuple including transformed keys and values. Their shapes \ + both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ + and their data types are same as inputs. + """ + k = self.k_proj(key) + v = self.v_proj(value) + k = layers.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v + + def gen_cache(self, key, value=None, type=Cache): + """ + Generates cache for `forward` usage in inference accroding to arguments. + The generated cache is an instance of `MultiHeadAttention.Cache` or an + instance of `MultiHeadAttention.StaticCache`. + + `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, + and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` + which are results of linear projection, reshape and transpose calculations + in MultiHeadAttention. + + If the generated cache is an instance of `Cache`, `k` and `v` fields + reserve intermediate result tensors of previous positions, and the tensors + are incremental among decoding steps, which mostly are used for decoder + decoder self attention. + + If the generated cache is an instance of `StaticCache`, `k` and `v` fields + would be used as calculated result tensors on keys an values in `forward`, + and the tensors keep unchanged among decoding steps, which are mostly used + for decoder-encoder cross attention. + + The cache is generated as follows: + + 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the + results to create an instance of `StaticCache`. + + 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped + `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results + to create an instance of `Cache`, where `batch_size` is from the first + dimension of `key`. + + 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create + an instance of `Cache`. + + Parameters: + key (Tensor): The keys for multi-head attention. 
It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If `value` is None, + it is only for batch size and data type reference. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, `key` is only + for batch size reference. Default None. + type (type): It should be `MultiHeadAttention.StaticCache` or + `MultiHeadAttention.Cache` to indicate the cache type to generate. + + Returns: + namedtuple: an instance of `Cache` or `StaticCache` accordingly. + """ + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.compute_kv(key, value) + return self.StaticCache(k, v) + elif value is None: # incremental_state + k = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + v = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + return self.Cache(k, v) + else: + # incremental_state with initial value, mainly for usage like UniLM + return self.Cache(key, value) + + def forward(self, query, key, value, attn_mask=None, cache=None): + """ + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + It is a namedtuple with `k` and `v` as fields, and stores tensors + shaped `[batch_size, num_heads, length, embed_dim]` which are results + of linear projection, reshape and transpose calculations in + MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` + fields reserve intermediate results of previous positions, which + mostly used for decoder self attention. If it is an instance of + `StaticCache`, `key` and `value` args would be ignored, `k` and + `v` fields would be used as calculated results on `key` and + `value`, which mostly used for decoder-encoder cross attention. + It is only used for inference and should be None for training. + Default None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. 
If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + if cache is None: + q, k, v = self._prepare_qkv(query, key, value, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, cache) + + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) + if attn_mask is not None: + # TODO(guosheng): support bool mask + product = product + attn_mask + weights = layers.softmax(product) + if self.dropout: + weights = layers.dropout( + weights, + dropout_prob=self.dropout, + dropout_implementation="upscale_in_train", + is_test=False) + + out = layers.matmul(weights, v) + + # combine heads + out = layers.transpose(out, perm=[0, 2, 1, 3]) + out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + if cache is not None: + outs.append(cache) + return out if len(outs) == 1 else tuple(outs) + + +class TransformerEncoderLayer(Layer): + """ + TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) + attention and feedforward network. Before and after each sub-layer, pre-process + and post-precess would be applied on the input and output accordingly. If + `normalize_before` is True, pre-process is layer normalization and post-precess + includes dropout, residual connection. Otherwise, no pre-process and post-precess + includes dropout, residual connection, layer normalization. + + Parameters: + d_model (int): The expected feature size in the input and output. + nhead (int): The number of heads in multi-head attention(MHA). + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. + Default: None, which means the default weight parameter property is used. 
+ See usage for details in :code:`ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . Default: None, + which means the default bias parameter property is used. + + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerEncoderLayer + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, n_head, src_len, src_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(128, 2, 512) + enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None): + self._config = locals() + self._config.pop("self") + self._config.pop("__class__", None) # py3 + + super(TransformerEncoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + weight_attrs = _convert_param_attr_to_list(weight_attr, 2) + bias_attrs = _convert_param_attr_to_list(bias_attr, 2) + + self.self_attn = MultiHeadAttention( + d_model, + nhead, + dropout=attn_dropout, + weight_attr=weight_attrs[0], + bias_attr=bias_attrs[0]) + self.linear1 = Linear( + d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) + self.dropout = Dropout( + act_dropout, dropout_implementation="upscale_in_train") + self.linear2 = Linear( + dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.dropout1 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout2 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.activation = getattr(layers, activation) + + def forward(self, src, src_mask=None): + """ + Applies a Transformer encoder layer on the input. + + Parameters: + src (Tensor): The input of Transformer encoder layer. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + src_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + + Returns: + Tensor: The output of Transformer encoder layer. It is a tensor that \ + has the same shape and data type as `enc_input`. 
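+
+        When `normalize_before` is True, each sub-layer here is computed as
+        `x = x + dropout(sublayer(norm(x)))` (pre-norm); otherwise it is computed
+        as `x = norm(x + dropout(sublayer(x)))` (post-norm), as implemented below.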
+ """ + residual = src + if self.normalize_before: + src = self.norm1(src) + # TODO(guosheng): Add cache for encoder for the usage like UniLM + src = self.self_attn(src, src, src, src_mask) + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(Layer): + """ + TransformerEncoder is a stack of N encoder layers. + + Parameters: + encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It + would be used as the first layer, and the other layers would be created + according to the configurations of it. + num_layers (int): The number of encoder layers to be stacked. + norm (LayerNorm, optional): the layer normalization component. If provided, + apply layer normalization on the output of last encoder layer. + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerEncoderLayer, TransformerEncoder + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, n_head, src_len, src_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(128, 2, 512) + encoder = TransformerEncoder(encoder_layer, 2) + enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] + """ + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = LayerList([(encoder_layer if i == 0 else + type(encoder_layer)(**encoder_layer._config)) + for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None): + """ + Applies a stack of N Transformer encoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last encoder + layer. + + Parameters: + src (Tensor): The input of Transformer encoder. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data + type should be float32 or float64. + src_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + + Returns: + Tensor: The output of Transformer encoder. It is a tensor that \ + has the same shape and data type as `src`. + """ + output = src + + for mod in self.layers: + output = mod(output, src_mask=src_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoderLayer(Layer): + """ + TransformerDecoderLayer is composed of three sub-layers which are decoder + self (multi-head) attention, decoder-encoder cross attention and feedforward + network. Before and after each sub-layer, pre-process and post-precess would + be applied on the input and output accordingly. If `normalize_before` is True, + pre-process is layer normalization and post-precess includes dropout, residual + connection. 
Otherwise, no pre-process is applied and the
+    post-process includes dropout, residual connection and layer normalization.
+
+    Parameters:
+        d_model (int): The expected feature size in the input and output.
+        nhead (int): The number of heads in multi-head attention (MHA).
+        dim_feedforward (int): The hidden layer size in the feedforward network (FFN).
+        dropout (float, optional): The dropout probability used in pre-process
+            and post-process of MHA and FFN sub-layers. Default 0.1
+        activation (str, optional): The activation function in the feedforward
+            network. Default relu.
+        attn_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. If None, use the value of
+            `dropout`. Default None
+        act_dropout (float, optional): The dropout probability used after FFN
+            activation. If None, use the value of `dropout`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
+            normalization and post-process includes dropout, residual connection.
+            Otherwise, no pre-process is applied and post-process includes dropout,
+            residual connection and layer normalization. Default False
+        weight_attr (ParamAttr|tuple, optional): To specify the weight parameter property.
+            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            self attention, `weight_attr[1]` would be used as `weight_attr` for
+            cross attention, and `weight_attr[2]` would be used as `weight_attr`
+            for linear in FFN. Otherwise, all three sub-layers use it as
+            `weight_attr` to create parameters. Default: None, which means the
+            default weight parameter property is used. See usage for details
+            in :ref:`api_fluid_ParamAttr` .
+        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            self attention, `bias_attr[1]` would be used as `bias_attr` for
+            cross attention, and `bias_attr[2]` would be used as `bias_attr`
+            for linear in FFN. Otherwise, all three sub-layers use it as
+            `bias_attr` to create parameters. The `False` value means the
+            corresponding layer would not have a trainable bias parameter. See
+            usage for details in :code:`ParamAttr` . Default: None, which means
+            the default bias parameter property is used.
+
+    Examples:
+
+        ..
code-block:: python + + import paddle + from paddle import TransformerDecoderLayer + + # decoder input: [batch_size, tgt_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention mask: [batch_size, n_head, tgt_len, tgt_len] + self_attn_mask = paddle.rand((2, 2, 4, 4)) + # cross attention mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(128, 2, 512) + output = decoder_layer(dec_input, + enc_output, + self_attn_mask, + cross_attn_mask) # [2, 4, 128] + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None): + self._config = locals() + self._config.pop("self") + self._config.pop("__class__", None) # py3 + + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + weight_attrs = _convert_param_attr_to_list(weight_attr, 3) + bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + + self.self_attn = MultiHeadAttention( + d_model, + nhead, + dropout=attn_dropout, + weight_attr=weight_attrs[0], + bias_attr=bias_attrs[0]) + self.cross_attn = MultiHeadAttention( + d_model, + nhead, + dropout=attn_dropout, + weight_attr=weight_attrs[1], + bias_attr=bias_attrs[1]) + self.linear1 = Linear( + d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) + self.dropout = Dropout( + act_dropout, dropout_implementation="upscale_in_train") + self.linear2 = Linear( + dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.norm3 = LayerNorm(d_model) + self.dropout1 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout2 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout3 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.activation = getattr(layers, activation) + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Applies a Transformer decoder layer on the input. + + Parameters: + tgt (Tensor): The input of Transformer decoder layer. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Tensor): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Tensor, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Tensor, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. 
+ The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ), + `incremental_cache` is an instance of `MultiHeadAttention.Cache`, + `static_cache` is an instance of `MultiHeadAttention.StaticCache. + See `TransformerDecoderLayer.gen_cache` for more details. It is + only used for inference and should be None for training. Default + None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder layer. \ + Or a tuple if `cache` is not None, except for decoder layer output, \ + the tuple includes the new cache which is same as input `cache` \ + argument but `incremental_cache` in it has an incremental length. \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ + for more details. + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + if cache is None: + tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None) + else: + tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, + cache[0]) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + if cache is None: + tgt = self.cross_attn(tgt, memory, memory, memory_mask, None) + else: + tgt, static_cache = self.cross_attn(tgt, memory, memory, + memory_mask, cache[1]) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt if cache is None else (tgt, (incremental_cache, + static_cache)) + + def gen_cache(self, memory): + """ + Generates cache for `forward` usage. The generated cache is a tuple + composed of an instance of `MultiHeadAttention.Cache` and an instance + of `MultiHeadAttention.StaticCache`. + + Parameters: + memory (Tensor): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + + Returns: + tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). \ + `incremental_cache` is an instance of `MultiHeadAttention.Cache` \ + produced by `self_attn.gen_cache(memory, MultiHeadAttention.Cache)`, \ + it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \ + `static_cache` is an instance of `MultiHeadAttention.StaticCache` \ + produced by `cross_attn.gen_cache(memory, MultiHeadAttention.StaticCache)`, \ + it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`. + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ + for more details. + """ + incremental_cache = self.self_attn.gen_cache( + memory, type=self.self_attn.Cache) + static_cache = self.cross_attn.gen_cache( + memory, memory, type=self.cross_attn.StaticCache) + return incremental_cache, static_cache + + +class TransformerDecoder(Layer): + """ + TransformerDecoder is a stack of N decoder layers. + + Parameters: + decoder_layer (Layer): an instance of the `TransformerDecoderLayer`. It + would be used as the first layer, and the other layers would be created + according to the configurations of it. 
+ num_layers (int): The number of decoder layers to be stacked. + norm (LayerNorm, optional): the layer normalization component. If provided, + apply layer normalization on the output of last encoder layer. + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerDecoderLayer, TransformerDecoder + + # decoder input: [batch_size, tgt_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention mask: [batch_size, n_head, tgt_len, tgt_len] + self_attn_mask = paddle.rand((2, 2, 4, 4)) + # cross attention mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(128, 2, 512) + decoder = TransformerDecoder(decoder_layer, 2) + output = decoder(dec_input, + enc_output, + self_attn_mask, + cross_attn_mask) # [2, 4, 128] + """ + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + self.layers = LayerList([(decoder_layer if i == 0 else + type(decoder_layer)(**decoder_layer._config)) + for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Applies a stack of N Transformer decoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last decoder + layer. + + Parameters: + tgt (Tensor): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Tensor): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Tensor, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Tensor, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + cache (list, optional): It is a list, and each element in the list + is a tuple( :code:`(incremental_cache, static_cache)` ). See + `TransformerDecoder.gen_cache` for more details. It is only + used for inference and should be None for training. Default None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder. \ + Or a tuple if `cache` is not None, except for decoder output, \ + the tuple includes the new cache which is same as input `cache` \ + argument but `incremental_cache` in it has an incremental length. \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ + for more details. 
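+
+        Examples:
+            A minimal sketch of step-by-step decoding with `gen_cache`; shapes
+            and the import path follow the class example above and are
+            illustrative only:
+
+            .. code-block:: python
+
+                import paddle
+                from paddle import TransformerDecoderLayer, TransformerDecoder
+
+                paddle.disable_static()
+                decoder_layer = TransformerDecoderLayer(128, 2, 512)
+                decoder = TransformerDecoder(decoder_layer, 2)
+
+                memory = paddle.rand((2, 6, 128))   # encoder output: [batch_size, src_len, d_model]
+                tgt = paddle.rand((2, 1, 128))      # current step input: [batch_size, 1, d_model]
+                cache = decoder.gen_cache(memory)   # one (incremental_cache, static_cache) per layer
+                output, cache = decoder(tgt, memory, cache=cache)  # [2, 1, 128], cache grows by one step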
+ """ + output = tgt + new_caches = [] + for i, mod in enumerate(self.layers): + if cache is None: + output = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=None) + else: + output, new_cache = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=cache[i]) + new_caches.append(new_cache) + + if self.norm is not None: + output = self.norm(output) + + return output if cache is None else (output, new_caches) + + def gen_cache(self, memory, do_zip=False): + """ + Generates cache for `forward` usage. The generated cache is a list, and + each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) + produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` + for more details. If `do_zip` is True, apply `zip` on these tuples to get + a list with two elements. + + + Parameters: + memory (Tensor): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + do_zip (bool, optional): Indicate whether to apply `zip` on the tuples. + If True, return a list with two elements. Default False + + Returns: + list: It is a list, and each element in the list is a tuple produced \ + by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \ + for more details. If `do_zip` is True, apply `zip` on these tuples \ + and return a list with two elements. + """ + cache = [layer.gen_cache(memory) for layer in self.layers] + if do_zip: + cache = list(zip(*cache)) + return cache + + +class Transformer(Layer): + """ + A Transformer model composed of an instance of `TransformerEncoder` and an + instance of `TransformerDecoder`. While the embedding layer and output layer + are not included. + + Please refer to `Attention is all you need `_ , + and see `TransformerEncoder` and `TransformerDecoder` for more details. + + Users can configurate the model architecture with corresponding parameters. + Note the usage of `normalize_before` representing where to apply layer + normalization (in pre-process or post-precess of multi-head attention or FFN), + and some transformer like models are different on this, such as + `BERT `_ and `GPT2 `_ . + The default architecture here places layer normalization in post-process and + applies another layer normalization on the output of last encoder/decoder layer. + + Parameters: + d_model (int): The expected feature size in the encoder/decoder input + and output. + nhead (int): The number of heads in multi-head attention(MHA). + num_encoder_layers (int): The number of layers in encoder. + num_encoder_layers (int): The number of layers in decoder. + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. 
+ Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + self attention, `weight_attr[1]` would be used as `weight_attr` for + cross attention, and `weight_attr[2]` would be used as `weight_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `weight_attr` to create parameters. Default: None, which means the + default weight parameter property is used. See usage for details + in :code:`ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + self attention, `bias_attr[1]` would be used as `bias_attr` for + cross attention, and `bias_attr[2]` would be used as `bias_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `bias_attr` to create parameters. The `False` value means the + corresponding layer would not have trainable bias parameter. See + usage for details in :code:`ParamAttr` . Default: None,which means + the default bias parameter property is used. + custom_encoder (Layer): If custom encoder is provided, use it as the encoder. + Default None + custom_decoder (Layer): If custom decoder is provided, use it as the decoder. + Default None + + Examples: + + .. code-block:: python + + import paddle + from paddle import Transformer + + # src: [batch_size, tgt_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # tgt: [batch_size, src_len, d_model] + dec_input = paddle.rand((2, 6, 128)) + # src_mask: [batch_size, n_head, src_len, src_len] + enc_self_attn_mask = paddle.rand((2, 2, 4, 4)) + # tgt_mask: [batch_size, n_head, tgt_len, tgt_len] + dec_self_attn_mask = paddle.rand((2, 2, 6, 6)) + # memory_mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 6, 4)) + transformer = Transformer(128, 2, 4, 4, 512) + output = transformer(enc_input, + dec_input, + enc_self_attn_mask, + dec_self_attn_mask, + cross_attn_mask) # [2, 6, 128] + """ + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + custom_encoder=None, + custom_decoder=None): + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before, weight_attr, + bias_attr) + encoder_norm = LayerNorm(d_model) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, + encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before, weight_attr, + bias_attr) + decoder_norm = LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, + decoder_norm) + + self.d_model = d_model + self.nhead = nhead + + def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): + """ + Applies a Transformer model on the inputs. + + Parameters: + src (Tensor): The input of Transformer encoder. 
It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt (Tensor): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Tensor): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Tensor, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Tensor, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + + Returns: + Tensor: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder. + """ + memory = self.encoder(src, src_mask=src_mask) + output = self.decoder( + tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask) + return output diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py new file mode 100644 index 0000000000000000000000000000000000000000..a5f360ec02e6d8b59b80db4602776e904cf0b499 --- /dev/null +++ b/python/paddle/nn/layer/vision.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: define specitial functions used in computer vision task + +from ...fluid.dygraph import layers +from .. import functional + +__all__ = ['PixelShuffle'] + + +class PixelShuffle(layers.Layer): + """ + + PixelShuffle Layer + + This operator rearranges elements in a tensor of shape [N, C, H, W] + to a tensor of shape [N, C/upscale_factor**2, H*upscale_factor, W*upscale_factor], + or from shape [N, H, W, C] to [N, H*upscale_factor, W*upscale_factor, C/upscale_factor**2]. + This is useful for implementing efficient sub-pixel convolution + with a stride of 1/upscale_factor. + Please refer to the paper: `Real-Time Single Image and Video Super-Resolution + Using an Efficient Sub-Pixel Convolutional Neural Network `_ . + by Shi et. al (2016) for more details. + + Parameters: + + upscale_factor(int): factor to increase spatial resolution. + data_format (str): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". 
When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width]. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 4-D tensor with shape: (N, C, H, W) or (N, H, W, C). + - out: 4-D tensor with shape: (N, C/upscale_factor**2, H*upscale_factor, W*upscale_factor) or (N, H*upscale_factor, W*upscale_factor, C/upscale_factor^2). + + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + + paddle.disable_static() + x = np.random.randn(2, 9, 4, 4).astype(np.float32) + x_var = paddle.to_tensor(x) + pixel_shuffle = nn.PixelShuffle(3) + out_var = pixel_shuffle(x_var) + out = out_var.numpy() + print(out.shape) + # (2, 1, 12, 12) + + """ + + def __init__(self, upscale_factor, data_format="NCHW", name=None): + super(PixelShuffle, self).__init__() + + if not isinstance(upscale_factor, int): + raise TypeError("upscale factor must be int type") + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Data format should be 'NCHW' or 'NHWC'." + "But recevie data format: {}".format(data_format)) + + self._upscale_factor = upscale_factor + self._data_format = data_format + self._name = name + + def forward(self, x): + return functional.pixel_shuffle(x, self._upscale_factor, + self._data_format, self._name) diff --git a/python/paddle/nn/input.py b/python/paddle/nn/utils/__init__.py similarity index 81% rename from python/paddle/nn/input.py rename to python/paddle/nn/utils/__init__.py index b5f591f44a9a167dcba8e4e46322ca157a5e48cb..6562ac35e1e3180db671f90188f1304f07864189 100644 --- a/python/paddle/nn/input.py +++ b/python/paddle/nn/utils/__init__.py @@ -12,10 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define input placeholders of neural network -from ..fluid import data #DEFINE_ALIAS - -__all__ = [ - 'data', - # 'Input' -] +from . import weight_norm_hook +from .weight_norm_hook import weight_norm, remove_weight_norm diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..ad53bf394660f3a7e0e48fdbd5eb530abd0852bb --- /dev/null +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -0,0 +1,225 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from ... 
import fluid +from ...fluid import dygraph +from ...fluid import layers as F +from ...fluid.layer_helper import LayerHelper +from ...fluid.data_feeder import check_variable_and_dtype +from ...tensor.math import multiply + +__all__ = ['weight_norm', 'remove_weight_norm'] + + +def l2_norm(x, axis, epsilon=1e-12, name=None): + if len(x.shape) == 1: + axis = 0 + check_variable_and_dtype(x, "X", ("float32", "float64"), "norm") + + helper = LayerHelper("l2_normalize", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + norm = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="norm", + inputs={"X": x}, + outputs={"Out": out, + "Norm": norm}, + attrs={ + "axis": 1 if axis is None else axis, + "epsilon": epsilon, + }) + return F.squeeze(norm, axes=[axis]) + + +def norm_except_dim(p, dim): + shape = p.shape + ndims = len(shape) + if dim == -1: + return F.sqrt(F.reduce_sum(F.square(p)) + 1e-12) + elif dim == 0: + p_matrix = F.reshape(p, (shape[0], -1)) + return l2_norm(p_matrix, axis=1) + elif dim == ndims - 1: + p_matrix = F.reshape(p, (-1, shape[-1])) + return l2_norm(p_matrix, axis=0) + else: + perm = list(range(ndims)) + perm[0] = dim + perm[dim] = 0 + p_transposed = F.transpose(p, perm) + return norm_except_dim(p_transposed, 0) + + +def _weight_norm(v, g, dim): + shape = v.shape + ndims = len(shape) + + if dim == -1: + v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12) + elif dim == 0: + p_matrix = F.reshape(v, (shape[0], -1)) + v_normalized = F.l2_normalize(p_matrix, axis=1) + v_normalized = F.reshape(v_normalized, shape) + elif dim == ndims - 1: + p_matrix = F.reshape(v, (-1, shape[-1])) + v_normalized = F.l2_normalize(p_matrix, axis=0) + v_normalized = F.reshape(v_normalized, shape) + else: + perm = list(range(ndims)) + perm[0] = dim + perm[dim] = 0 + p_transposed = F.transpose(v, perm) + transposed_shape = p_transposed.shape + p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1)) + v_normalized = F.l2_normalize(p_matrix, axis=1) + v_normalized = F.reshape(v_normalized, transposed_shape) + v_normalized = F.transpose(v_normalized, perm) + weight = multiply(v_normalized, g, axis=dim if dim is not None else -1) + return weight + + +class WeightNorm(object): + def __init__(self, name, dim): + if dim is None: + dim = -1 + self.name = name + self.dim = dim + + def compute_weight(self, layer): + g = getattr(layer, self.name + '_g') + v = getattr(layer, self.name + '_v') + return _weight_norm(v, g, self.dim) + + @staticmethod + def apply(layer, name, dim): + for k, hook in layer._forward_pre_hooks.items(): + if isinstance(hook, WeightNorm) and hook.name == name: + raise RuntimeError("Cannot register two weight_norm hooks on " + "the same parameter {}".format(name)) + + if dim is None: + dim = -1 + + fn = WeightNorm(name, dim) + + w = getattr(layer, name) + del layer._parameters[name] + + g_var = norm_except_dim(w, dim) + v = layer.create_parameter(w.shape, dtype=w.dtype) + layer.add_parameter(name + "_v", v) + g = layer.create_parameter(g_var.shape, dtype=g_var.dtype) + layer.add_parameter(name + '_g', g) + with dygraph.no_grad(): + F.assign(w, v) + F.assign(g_var, g) + setattr(layer, name, fn.compute_weight(layer)) + + layer.register_forward_pre_hook(fn) + return fn + + def remove(self, layer): + w_var = self.compute_weight(layer) + delattr(layer, self.name) + del layer._parameters[self.name + '_g'] + del layer._parameters[self.name + '_v'] + w = layer.create_parameter(w_var.shape, dtype=w_var.dtype) + 
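# Re-register the recomputed weight (g * v / ||v||) as a plain
+        # parameter and copy its current value back in place under no_grad.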
layer.add_parameter(self.name, w)
+        with dygraph.no_grad():
+            F.assign(w_var, w)
+
+    def __call__(self, layer, inputs):
+        setattr(layer, self.name, self.compute_weight(layer))
+
+
+def weight_norm(layer, name='weight', dim=0):
+    """
+    This weight_norm hook applies weight normalization to a parameter of the
+    given layer according to the following formula:
+
+    .. math::
+
+        \mathbf{w} = g \dfrac{v}{\|v\|}
+
+    Weight normalization is a reparameterization of the weight vectors in a neural network that
+    decouples the magnitude of those weight vectors from their direction. Weight normalization
+    replaces the parameter specified by `name` (e.g. 'weight') with two parameters: one parameter
+    specifying the magnitude (e.g. 'weight_g') and one parameter specifying the direction
+    (e.g. 'weight_v'). Weight normalization has been implemented as discussed in this paper:
+    `Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks
+    `_.
+
+    Parameters:
+        layer(Layer): Layer of paddle, which has a weight parameter.
+        name(str, optional): Name of the weight parameter. Default: 'weight'.
+        dim(int, optional): Dimension over which to compute the norm. `dim` is a non-negative
+            number which is less than the rank of the weight Tensor. For example, dim can be
+            chosen from 0, 1, 2, 3 for a convolution whose weight shape is [cout, cin, kh, kw]
+            and rank is 4. If dim is set to None, all elements will be normalized. Default: 0.
+
+    Returns:
+        The origin layer with the weight norm hook registered.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            from paddle.nn import Conv2D
+            from paddle.nn.utils import weight_norm
+
+            x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+            paddle.disable_static()
+            conv = Conv2D(3, 5, 3)
+            wn = weight_norm(conv)
+            print(conv.weight_g.shape)
+            # [5]
+            print(conv.weight_v.shape)
+            # [5, 3, 3, 3]
+    """
+    WeightNorm.apply(layer, name, dim)
+    return layer
+
+
+def remove_weight_norm(layer, name='weight'):
+    """
+    Remove weight normalization from the given layer.
+
+    Parameters:
+        layer(Layer): Layer of paddle, which has a weight parameter.
+        name(str, optional): Name of the weight parameter. Default: 'weight'.
+
+    Returns:
+        The origin layer without the weight norm hook.
+
+    Examples:
+        ..
code-block:: python + import paddle + from paddle.nn import Conv2D + from paddle.nn.utils import weight_norm, remove_weight_norm + + paddle.disable_static() + conv = Conv2D(3, 5, 3) + wn = weight_norm(conv) + remove_weight_norm(conv) + print(conv.weight_g) + # AttributeError: 'Conv2D' object has no attribute 'weight_g' + """ + for k, hook in layer._forward_pre_hooks.items(): + if isinstance(hook, WeightNorm) and hook.name == name: + hook.remove(layer) + del layer._forward_pre_hooks[k] + return layer + + raise ValueError("weight_norm of '{}' not found in {}".format(name, layer)) diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 4dc3cf397aea59f3fedfc86bff7a77556a6a63a7..49314c9832dd389411dffb3f498b34d09337a3f0 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -14,21 +14,32 @@ __all__ = [ 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', - 'Adamax', 'AdamaxOptimizer', 'AdamOptimizer', 'DecayedAdagrad', - 'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd', - 'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', - 'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer', - 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer', - 'PipelineOptimizer', 'RecomputeOptimizer', 'RMSPropOptimizer', 'SGD', - 'SGDOptimizer' + 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', + 'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer', + 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer', + 'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer', + 'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer', + 'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', + '_LRScheduler', 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', + 'PolynomialLR', 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', + 'LambdaLR', 'ReduceLROnPlateau', 'CosineAnnealingLR' ] -from ..fluid.optimizer import SGD, Momentum, Adagrad, Adam, Adamax, Dpsgd, DecayedAdagrad, \ - Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, \ - AdamOptimizer, AdamaxOptimizer, DpsgdOptimizer, \ - DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \ - AdadeltaOptimizer, ModelAverage, LarsMomentum, \ - LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \ +from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \ + Ftrl, Adadelta, \ + SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\ + DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \ + ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\ ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \ - RecomputeOptimizer + RecomputeOptimizer, LarsMomentumOptimizer + +from .optimizer import Optimizer +from .adam import Adam +from .adamw import AdamW +from .adamax import Adamax +from .rmsprop import RMSProp + +from . import lr_scheduler +from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \ + LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py new file mode 100644 index 0000000000000000000000000000000000000000..0da8053fe8a3495f5d3188a737638531347de648 --- /dev/null +++ b/python/paddle/optimizer/adam.py @@ -0,0 +1,246 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable + +__all__ = ["Adam"] + + +class Adam(Optimizer): + """ + The Adam optimizer uses an optimization described at the end + of section 2 of `Adam paper `_ , + it can dynamically adjusts the learning rate of each parameter using + the 1st moment estimates and the 2nd moment estimates of the gradient. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + + Related paper: `Adam: A Method for Stochastic Optimization `_ + + Args: + learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. The default value is 0.001. + beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.9. + beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. 
+ lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + out.backward() + adam.step() + adam.clear_grad() + + .. code-block:: python + + # Adam with beta1/beta2 as Tensor and weight_decay as float + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + + """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(Adam, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adam" + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + self._lazy_mode = lazy_mode + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + # Create accumulator tensors for first and second moments + for p in parameters: + self._add_accumulator(self._moment1_acc_str, p) + self._add_accumulator(self._moment2_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment1 = self._get_accumulator(self._moment1_acc_str, + param_and_grad[0]) + moment2 = self._get_accumulator(self._moment2_acc_str, + param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + # create the adam optimize op + + if 
framework.in_dygraph_mode(): + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + _, _, _, _, _ = core.ops.adam( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, + moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, + 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', + 1000, 'beta1', _beta1, 'beta2', _beta2) + + return None + + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "LearningRate": [lr], + "Moment1": [moment1], + "Moment2": [moment2], + "Beta1Pow": [beta1_pow_acc], + "Beta2Pow": [beta2_pow_acc] + } + outputs = { + "ParamOut": [param_and_grad[0]], + "Moment1Out": [moment1], + "Moment2Out": [moment2], + "Beta1PowOut": [beta1_pow_acc], + "Beta2PowOut": [beta2_pow_acc], + } + attrs = { + "epsilon": self._epsilon, + "lazy_mode": self._lazy_mode, + "min_row_size_to_use_multithread": 1000 + } + + if isinstance(self._beta1, Variable): + inputs['Beta1Tensor'] = self._beta1 + else: + attrs['beta1'] = self._beta1 + if isinstance(self._beta2, Variable): + inputs['Beta2Tensor'] = self._beta2 + else: + attrs['beta2'] = self._beta2 + + adam_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + + return adam_op diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py new file mode 100644 index 0000000000000000000000000000000000000000..73a78b17cbba55c1ee90a2708f6c163940158a51 --- /dev/null +++ b/python/paddle/optimizer/adamax.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable, name_scope + +__all__ = ["Adamax"] + + +class Adamax(Optimizer): + """ + The Adamax optimizer is implemented based on the Adamax Optimization + in Section 7 of `Adam paper `_. + The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, + which makes the learning rate update algorithm more stable and simple. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + t & = t + 1 + + moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad + + inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) + + learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + + Related paper: `Adam: A Method for Stochastic Optimization `_ + + The original paper does not have an ``epsilon`` attribute, + it is added here for numerical stability to prevent the division by 0 error. + + Args: + learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. 
+            It can be a float value or a LearningRateDecay. The default value is 0.001.
+        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+            The default value is 0.9.
+        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+            The default value is 1e-08.
+        parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, in which case all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as the coefficient of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for the user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+
+    **Notes**:
+        **Currently, Adamax doesn't support sparse parameter optimization.**
+
+    Examples:
+        ..
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adamax(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + + """ + _moment_acc_str = "moment" + _inf_norm_acc_str = "inf_norm" + _beta1_pow_acc_str = "beta1_pow_acc" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(Adamax, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adamax" + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + + def _create_accumulators(self, block, parameters): + # Create accumulator tensors for first moment and infinity norm + for p in parameters: + self._add_accumulator(self._moment_acc_str, p) + self._add_accumulator(self._inf_norm_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=self._beta1, + shape=[1]) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) + inf_norm = self._get_accumulator(self._inf_norm_acc_str, + param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + # create the adamax optimize op + adamax_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad), + "Moment": moment, + "InfNorm": inf_norm, + "Beta1Pow": beta1_pow_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": moment, + "InfNormOut": inf_norm + }, + attrs={ + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon + }, + stop_gradient=True) + + return adamax_op + + def _finish_update(self, block, parameters_and_grads): + """Update Beta1 Power accumulator + """ + assert isinstance(block, framework.Block) + for param, grad in parameters_and_grads: + if grad is None or param.trainable is False: + continue + with param.block.program._optimized_guard( + [param, grad]), name_scope('adamax'): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}, + stop_gradient=True) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py new file mode 100644 index 0000000000000000000000000000000000000000..f498fcbffa24ec188b57ceb2d3c6884fc1e135d2 --- /dev/null +++ b/python/paddle/optimizer/adamw.py @@ -0,0 +1,233 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from .adam import Adam +from ..fluid import framework +import paddle +__all__ = ['AdamW'] + + +class DecoupledWeightDecay(object): + def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs): + if not isinstance(coeff, float) and \ + not isinstance(coeff, framework.Variable): + raise TypeError("coeff should be float or Tensor.") + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._coeff = coeff + super(DecoupledWeightDecay, self).__init__(**kwargs) + + def _scale_parameters(self, params_and_grads): + """ + Adds weight decay ops. + scaled_parameter = parameter * coeff + + Args: + params_and_grads: A list of (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. + """ + if isinstance(self._coeff, float) and self._coeff == 0.0: + return + + scaled_params = [] + for param, grad in params_and_grads: + # If no gradient then we don't need to do anything + if grad is None: + continue + if self._apply_decay_param_fun is not None \ + and not self._apply_decay_param_fun(param.name): + continue + + if isinstance(self._coeff, float): + assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \ + "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype) + else: + assert self._coeff.dtype == param.dtype, \ + "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype) + + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + assert param.name not in self._params_name + scaled_params.append((param, grad, param * self._coeff)) + self._params_name.add(param.name) + return scaled_params + + def backward(self, **kargs): + return super(DecoupledWeightDecay, self).backward(**kargs) + + def _apply_optimize(self, **kargs): + return super(DecoupledWeightDecay, self)._apply_optimize(**kargs) + + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + params_grads = self.backward( + loss=loss, + startup_program=startup_program, + parameters=parameters, + no_grad_set=no_grad_set) + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + paddle.fluid.layers.assign(input=updated_param, output=param) + + optimize_ops = self._apply_optimize( + loss=loss, + params_grads=params_grads, + startup_program=startup_program) + return optimize_ops, params_grads + + @framework.dygraph_only + def step(self): + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in 
scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + paddle.fluid.layers.assign(input=updated_param, output=param) + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + + def __str__(self): + return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) + + +class AdamW(DecoupledWeightDecay, Adam): + """ + The AdamW optimizer is implemented based on the AdamW Optimization + in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. + it can resolves the problem of L2 regularization failure in the Adam optimizer. + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} + + param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + + + Args: + learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. The default value is 0.001. + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.9. + beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0. + The default value is 1e-08. + apply_decay_param_fun (function|None): If it is not None, + only tensors that makes apply_decay_param_fun(Tensor)==True + will be updated. It only works when we want to specify tensors. + Default: None. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. 
+ **Notes**: + **Currently, AdamW doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.AdamW(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + + """ + + def __init__(self, + learning_rate=0.001, + parameters=None, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=0.0, + apply_decay_param_fun=None, + grad_clip=None, + name=None, + lazy_mode=False): + args_dict = { + "learning_rate": learning_rate, + "parameters": parameters, + "beta1": beta1, + "beta2": beta2, + "epsilon": epsilon, + "grad_clip": grad_clip, + "name": name, + "lazy_mode": lazy_mode + } + super(AdamW, self).__init__( + weight_decay, + apply_decay_param_fun=apply_decay_param_fun, + **args_dict) diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b --- /dev/null +++ b/python/paddle/optimizer/lr_scheduler.py @@ -0,0 +1,1430 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import numpy +import warnings +from paddle import Tensor + +__all__ = [ + 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', + 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', + 'ReduceLROnPlateau', 'CosineAnnealingLR' +] + + +class _LRScheduler(object): + """LRScheduler Base class. + + Define the common interface of an LRScheduler. + User can 'form paddle.optimizer.lr_scheduler import _LRScheduler' + And inherit from it to have a custom implementation of get_lr(). + """ + + def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): + if not isinstance(learning_rate, (float, int)): + raise TypeError( + "The type of learning rate must be float, but received {}". + format(type(learning_rate))) + self.base_lr = float(learning_rate) + self.last_lr = float(learning_rate) + self.last_epoch = last_epoch + self.verbose = verbose + self._var_name = None + + self.step() + + def __call__(self): + """ + Return last computed learning rate on current epoch. + """ + return self.last_lr + + def step(self, epoch=None): + """ + 'step' should be called after 'minimize' . It will update the learning rate in optimizer according to 'epoch'. + The new learning rate will take effect on next epoch. + + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + + Returns: + None + + Examples: + Please refer to the example of current _LRScheduler. 
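+            A short sketch of the two calling modes (``scheduler`` here stands for any
+            constructed scheduler instance):
+
+            .. code-block:: python
+
+                scheduler.step()          # last_epoch is incremented by 1
+                scheduler.step(epoch=10)  # last_epoch jumps directly to epoch 10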
+ """ + if epoch is None: + self.last_epoch += 1 + self.last_lr = self.get_lr() + else: + self.last_epoch = epoch + if hasattr(self, "_get_closed_form_lr"): + self.last_lr = self._get_closed_form_lr() + else: + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def state_dict(self): + """ + Returns the state of the scheduler as a :class:`dict`. + + It is a subset of self.__dict__ . + """ + self._state_keys() + state_dict = {} + for key in self.keys: + if key not in self.__dict__: + continue + value = self.__dict__[key] + if isinstance(value, Tensor): + assert value.shape == [ + 1 + ], "shape of Tensor in state_dict must be [1] {}".format( + value.shape) + value = value.numpy()[0] + state_dict[key] = value + + return state_dict + + # For those subclass who overload _LRScheduler, "last_epoch, last_lr" will be saved by default. + # (Note): you can change it for your subclass. + def _state_keys(self): + """ + set the keys in self.__dict__ that are needed to be saved. + """ + self.keys = ['last_epoch', 'last_lr'] + + def set_dict(self, state_dict): + """ + Loads the schedulers state. + """ + self._state_keys() + for key in self.keys: + if key in state_dict: + self.__dict__[key] = state_dict[key] + else: + raise RuntimeError( + "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict". + format(key)) + if len(state_dict) > len(self.keys): + warnings.warn( + "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" + ) + + # alias for set_dict + set_state_dict = set_dict + + def get_lr(self): + # calculate by python float + raise NotImplementedError + + +class NoamLR(_LRScheduler): + """ + + Applies Noam Lear to the initial learning rate. + + The algorithm can be described as following. + + .. math:: + + new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5}) + + Please reference `attention is all you need `_ + + + Args: + d$_{model}$(int): The dimensionality of input and output feature vector of model. It is a python int number. + warmup_steps(int): The number of warmup steps. A super parameter. It is a python int number + learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``NoamLR`` instance to schedule learning rate. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + + """ + + def __init__(self, + d_model, + warmup_steps, + learning_rate=1.0, + last_epoch=-1, + verbose=False): + self.d_model = d_model + self.warmup_steps = warmup_steps + super(NoamLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + if self.last_epoch == 0: + a = 1 + else: + a = self.last_epoch**-0.5 + b = self.warmup_steps**-1.5 * self.last_epoch + return self.base_lr * (self.d_model**-0.5) * min(a, b) + + +class PiecewiseLR(_LRScheduler): + """ + + Piecewise learning rate scheduler. + + The algorithm can be described as the code below: + + .. code-block:: text + + boundaries = [100, 200] + values = [1.0, 0.5, 0.1] + if epoch < 100: + learning_rate = 1.0 + elif 100 <= global_step < 200: + learning_rate = 0.5 + else: + learning_rate = 0.1 + + Args: + boundaries(list): A list of steps numbers. The type of element in the list is python int. + values(list): A list of learning rate values that will be picked during different epoch boundaries. + The type of element in the list is python float. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``PiecewiseLR`` instance to schedule learning rate. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, boundaries, values, last_epoch=-1, verbose=False): + self.boundaries = boundaries + self.values = values + super(PiecewiseLR, self).__init__( + last_epoch=last_epoch, verbose=verbose) + + def get_lr(self): + + for i in range(len(self.boundaries)): + if self.last_epoch < self.boundaries[i]: + return self.values[i] + return self.values[len(self.values) - 1] + + +class NaturalExpLR(_LRScheduler): + """ + + Applies natural exponential decay to the initial learning rate. + + The algorithm can be described as following: + + .. math:: + + new\_learning\_rate = learning\_rate * e^{- gama * epoch} + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + gamma (float, optional): A Ratio to update the learning rate. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``NaturalExpLR`` instance to schedule learning rate. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): + self.gamma = gamma + super(NaturalExpLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch) + + +class InverseTimeLR(_LRScheduler): + """ + + Applies inverse time decay to the initial learning rate. + + The algorithm can be described as following: + + .. math:: + + new\_learning\_rate = \\frac{learning\_rate}{1 + gamma * epoch} + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``InverseTimeLR`` instance to schedule learning rate. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + + """ + + def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): + self.gamma = gamma + super(InverseTimeLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr / (1 + self.gamma * self.last_epoch) + + +class PolynomialLR(_LRScheduler): + """ + + Applies polynomial decay to the initial learning rate. + + The algorithm can be described as following. + + If cycle is set to True, then: + + .. math:: + + decay\_steps & = decay\_steps * math.ceil(\\frac{epoch}{decay\_steps}) + + new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr + + If cycle is set to False, then: + + .. math:: + + epoch & = min(epoch, decay\_steps) + + new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr + + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + decay_steps(int): The decay step size. It determines the decay cycle. + end_lr(float, optional): The minimum final learning rate. Default: 0.0001. + power(float, optional): Power of polynomial. Default: 1.0. + cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease + to ``end_lr`` . If False, the learning rate is monotone decreasing. Default: False. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``PolynomialLR`` instance to schedule learning rate. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, + learning_rate, + decay_steps, + end_lr=0.0001, + power=1.0, + cycle=False, + last_epoch=-1, + verbose=False): + self.decay_steps = decay_steps + self.end_lr = end_lr + self.power = power + self.cycle = cycle + super(PolynomialLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + tmp_epoch_num = self.last_epoch + tmp_decay_steps = self.decay_steps + if self.cycle: + div_res = math.ceil( + float(self.last_epoch) / float(self.decay_steps)) + + if self.last_epoch == 0: + div_res = 1 + tmp_decay_steps = self.decay_steps * div_res + else: + tmp_epoch_num = min(self.last_epoch, self.decay_steps) + + return (self.base_lr - self.end_lr) * ( + (1 - float(tmp_epoch_num) / float(tmp_decay_steps) + )**self.power) + self.end_lr + + +class LinearLrWarmup(_LRScheduler): + """ + + Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. + For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ + + When epoch < warmup_steps, learning rate is updated as: + + .. code-block:: text + + lr = start_lr + (end_lr - start_lr) * (epoch / warmup_steps) + + where start_lr is the initial learning rate, and end_lr is the final learning rate; + + When epoch >= warmup_steps, learning rate is updated as: + + .. code-block:: text + + lr = learning_rate + + where lr is float or any subclass of ``_LRScheduler`` . + + Args: + learning_rate (float|_LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``_LRScheduler`` . + warmup_steps (int): total steps of warm up. + start_lr (float): Initial learning rate of warm up. + end_lr (float): Final learning rate of warm up. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``LinearLrWarmup`` instance to schedule learning rate. 
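+
+    A quick numeric check of the warm-up formula above (plain Python, not using the
+    scheduler API; the values are only an illustration):
+
+    .. code-block:: python
+
+        start_lr, end_lr, warmup_steps = 0.0, 0.5, 20
+        for epoch in (5, 15):
+            print(start_lr + (end_lr - start_lr) * epoch / warmup_steps)
+        # prints 0.125 and 0.375; from epoch 20 on, the wrapped learning_rate is used as-is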
+ + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.LinearLrWarmup( + learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup( + learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, + learning_rate, + warmup_steps, + start_lr, + end_lr, + last_epoch=-1, + verbose=False): + type_check = isinstance(learning_rate, float) or isinstance( + learning_rate, int) or isinstance(learning_rate, _LRScheduler) + if not type_check: + raise TypeError( + "the type of learning_rate should be [int, float or _LRScheduler], the current type is {}". + format(learning_rate)) + self.learning_rate = learning_rate + self.warmup_steps = warmup_steps + self.start_lr = start_lr + self.end_lr = end_lr + assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format( + end_lr, start_lr) + super(LinearLrWarmup, self).__init__(start_lr, last_epoch, verbose) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + return (self.end_lr - self.start_lr) * float( + self.last_epoch) / float(self.warmup_steps) + self.start_lr + else: + if isinstance(self.learning_rate, _LRScheduler): + self.learning_rate.step() + return self.learning_rate() + + return self.learning_rate + + +class ExponentialLR(_LRScheduler): + """ + + Update learning rate by 'gamma' each epoch. + + The algorithm can be described as following. + + .. math:: + + new\_learning\_rate = last\_learning\_rate * gamma + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``ExponentialLR`` instance to schedule learning rate. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): + self.gamma = gamma + super(ExponentialLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr * (self.gamma**self.last_epoch) + + +class MultiStepLR(_LRScheduler): + """ + Update the learning rate by ``gama`` once ``epoch`` reaches one of the milestones. + + The algorithm can be described as the code below. + + .. code-block:: text + + learning_rate = 0.5 + milestones = [30, 50] + gamma = 0.1 + if epoch < 30: + learning_rate = 0.5 + elif epoch < 50: + learning_rate = 0.05 + else: + learning_rate = 0.005 + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + milestones (tuple|list): List or tuple of each boundaries. Must be increasing. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + + Returns: + ``MultiStepLR`` instance to schedule learning rate. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, + learning_rate, + milestones, + gamma=0.1, + last_epoch=-1, + verbose=False): + if not isinstance(milestones, (tuple, list)): + raise TypeError( + "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s." + % type(milestones)) + + if not all([ + milestones[i] < milestones[i + 1] + for i in range(len(milestones) - 1) + ]): + raise ValueError('The elements of milestones must be incremented') + if gamma >= 1.0: + raise ValueError('gamma should be < 1.0.') + + self.milestones = milestones + self.gamma = gamma + super(MultiStepLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + for i in range(len(self.milestones)): + if self.last_epoch < self.milestones[i]: + return self.base_lr * (self.gamma**i) + return self.base_lr * (self.gamma**len(self.milestones)) + + +class StepLR(_LRScheduler): + """ + Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch. + + The algorithm can be described as the code below. + + .. code-block:: text + + learning_rate = 0.5 + step_size = 30 + gamma = 0.1 + + learning_rate = 0.5 if epoch < 30 + learning_rate = 0.05 if 30 <= epoch < 60 + learning_rate = 0.005 if 60 <= epoch < 90 + ... + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + step_size (int): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``StepLR`` instance to schedule learning rate. + + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, + learning_rate, + step_size, + gamma=0.1, + last_epoch=-1, + verbose=False): + if not isinstance(step_size, int): + raise TypeError( + "The type of 'step_size' must be 'int', but received %s." % + type(step_size)) + if gamma >= 1.0: + raise ValueError('gamma should be < 1.0.') + + self.step_size = step_size + self.gamma = gamma + super(StepLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + i = self.last_epoch // self.step_size + return self.base_lr * (self.gamma**i) + + +class LambdaLR(_LRScheduler): + """ + Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` . + + The algorithm can be described as the code below. + + .. code-block:: text + + learning_rate = 0.5 # init learning_rate + lr_lambda = lambda epoch: 0.95 ** epoch + + learning_rate = 0.5 # epoch 0 + learning_rate = 0.475 # epoch 1 + learning_rate = 0.45125 # epoch 2 + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``LambdaLR`` instance to schedule learning rate. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + + """ + + def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): + if not callable(lr_lambda): + raise TypeError( + "The type of 'lr_lambda' in 'LambdaLR' must be 'function', but received %s." + % type(lr_lambda)) + + self.lr_lambda = lr_lambda + super(LambdaLR, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr * self.lr_lambda(self.last_epoch) + + +class ReduceLROnPlateau(_LRScheduler): + """ + Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate + by 2 to 10 times once model performance has no longer improvement. + + The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics`` + stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` . + (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience`` + number of epochs, the learning rate will be reduced.) + + In addition, After each reduction, it will wait a ``cooldown`` number of epochs before resuming above operation. + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the + learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning + rate will reduce when ``loss`` stops ascending. Default: ``'min'`` . + factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . + It should be less than 1.0. Default: 0.1. + patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. + Default: 10. + threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . + This make tiny changes of ``loss`` will be ignored. Default: 1e-4. 
+ threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss`` + is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum + change of ``loss`` is ``threshold`` . Default: ``'rel'`` . + cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0. + min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0. + epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, + the update is ignored. Default: 1e-8. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``. + + + Returns: + ``ReduceLROnPlateau`` instance to schedule learning rate. + + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step(loss) + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step(out[0]) + + """ + + def __init__(self, + learning_rate, + mode='min', + factor=0.1, + patience=10, + threshold=1e-4, + threshold_mode='rel', + cooldown=0, + min_lr=0, + epsilon=1e-8, + verbose=False): + mode = mode.lower() + if mode not in ['min', 'max']: + raise ValueError('mode: ' + mode + ' is unknown!') + self.mode = mode + + if factor >= 1.0: + raise ValueError( + 'new_lr = origin_lr * gamma and gamma should be < 1.0.') + self.factor = factor + + threshold_mode = threshold_mode.lower() + if threshold_mode not in ['rel', 'abs']: + raise ValueError('threshold mode: ' + threshold_mode + + ' is unknown!') + self.threshold_mode = threshold_mode + if not isinstance(learning_rate, (float, int)): + raise TypeError( + "The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float', but received %s." + % type(learning_rate)) + + self.verbose = verbose + self.patience = patience + self.threshold = threshold + self.threshold_mode = threshold_mode + self.cooldown = cooldown + self.min_lr = min_lr + self.epsilon = epsilon + + self.cooldown_counter = 0 + self.best = None + self.num_bad_epochs = 0 + + # Can not call Parent __init__, so implement here. 
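+        # The attributes below mirror what _LRScheduler.__init__ would set, but the
+        # base constructor is skipped on purpose: it ends with a call to self.step(),
+        # and ReduceLROnPlateau.step() requires a `metrics` argument that is not
+        # available at construction time.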
+        self.base_lr = float(learning_rate)
+        self.last_lr = float(learning_rate)
+        self.last_epoch = 0
+        self.verbose = verbose
+        self._var_name = None
+
+    # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
+    def _state_keys(self):
+        self.keys = [
+            'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
+            'last_lr'
+        ]
+
+    def step(self, metrics, epoch=None):
+        """
+        step should be called after 'minimize'. It will update the learning rate in optimizer according to ``metrics`` .
+        The new learning rate will take effect on next epoch.
+
+        Args:
+            metrics (Tensor|numpy.ndarray|float): Metrics that will be monitored to determine whether the learning rate will be reduced.
+                If it stops descending for a ``patience`` number of epochs, the learning rate will be reduced. If it's 'Tensor' or
+                'numpy.ndarray', its shape must be [1].
+            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+
+        Returns:
+            None
+
+        Examples:
+            Please refer to the example of current _LRScheduler.
+        """
+        if epoch is None:
+            self.last_epoch = self.last_epoch + 1
+        else:
+            self.last_epoch = epoch
+
+        # metrics must be a 1-D Tensor with shape [1]
+        if isinstance(metrics, (Tensor, numpy.ndarray)):
+            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \
+                "should be (1L,), but the current metrics.shape is {}. Maybe that " \
+                "you should call paddle.mean to process it first.".format(metrics.shape)
+        elif not isinstance(metrics,
+                            (int, float, numpy.float32, numpy.float64)):
+            raise TypeError(
+                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but received {}".
+                format(type(metrics)))
+
+        if self.cooldown_counter > 0:
+            self.cooldown_counter -= 1
+        else:
+            if self.best is None or self._is_better(metrics, self.best):
+                self.best = metrics
+                self.num_bad_epochs = 0
+            else:
+                self.num_bad_epochs += 1
+
+            if self.num_bad_epochs > self.patience:
+                self.cooldown_counter = self.cooldown
+                self.num_bad_epochs = 0
+                new_lr = max(self.last_lr * self.factor, self.min_lr)
+                if self.last_lr - new_lr > self.epsilon:
+                    self.last_lr = new_lr
+                    if self.verbose:
+                        print('Epoch {}: {} set learning rate to {}.'.format(
+                            self.last_epoch, self.__class__.__name__,
+                            self.last_lr))
+
+    def _is_better(self, current, best):
+        if self.mode == 'min' and self.threshold_mode == 'rel':
+            return current < best - best * self.threshold
+
+        elif self.mode == 'min' and self.threshold_mode == 'abs':
+            return current < best - self.threshold
+
+        elif self.mode == 'max' and self.threshold_mode == 'rel':
+            return current > best + best * self.threshold
+
+        else:
+            return current > best + self.threshold
+
+
+class CosineAnnealingLR(_LRScheduler):
+    """
+
+    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
+    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
+    SGDR.
+
+    The algorithm can be described as follows.
+
+    .. 
math:: + \begin{aligned} + \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), + & T_{cur} \neq (2k+1)T_{max}; \\ + \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) + \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), + & T_{cur} = (2k+1)T_{max}. + \end{aligned} + + It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts `_. + Note that this only implements the cosine annealing part of SGDR, and not the restarts. + + Args: + learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number. + T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. + eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``CosineAnnealingLR`` instance to schedule learning rate. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + # train on default dygraph mode + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + sgd.minimize(loss) + linear.clear_gradients() + scheduler.step() + + # train on static mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() + """ + + def __init__(self, + learning_rate, + T_max, + eta_min=0, + last_epoch=-1, + verbose=False): + if not isinstance(T_max, int): + raise TypeError( + "The type of 'T_max' in 'CosineAnnealingLR' must be 'int', but received %s." + % type(T_max)) + if not isinstance(eta_min, (float, int)): + raise TypeError( + "The type of 'eta_min' in 'CosineAnnealingLR' must be 'float, int', but received %s." 
+ % type(eta_min)) + self.T_max = T_max + self.eta_min = float(eta_min) + super(CosineAnnealingLR, self).__init__(learning_rate, last_epoch, + verbose) + + def get_lr(self): + if self.last_epoch == 0: + return self.base_lr + elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: + return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos( + math.pi / self.T_max)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / ( + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * ( + self.last_lr - self.eta_min) + self.eta_min + + def _get_closed_form_lr(self): + return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos( + math.pi * self.last_epoch / self.T_max)) / 2 diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..3f9de0cefc05d1aaee36fa3af5cfa9ae4affcb97 --- /dev/null +++ b/python/paddle/optimizer/optimizer.py @@ -0,0 +1,921 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import six +import logging +from collections import defaultdict + +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard +import paddle + +from ..fluid import framework +from ..fluid import layers +from ..fluid import unique_name +from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name +from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops +from ..fluid.framework import program_guard +from ..fluid.initializer import Constant +from ..fluid.layer_helper import LayerHelper +from ..fluid.layers import ops +from ..fluid.regularizer import append_regularization_ops +from ..fluid.dygraph import base as imperative_base +from ..fluid.dygraph import no_grad +from paddle.fluid import core +from paddle.fluid.layers import tensor +from functools import reduce +from ..fluid.wrapped_decorator import signature_safe_contextmanager +from .. import compat as cpt +from .lr_scheduler import _LRScheduler + +__all__ = ['Optimizer'] + + +class Optimizer(object): + """Optimizer Base class. + + Define the common interface of an optimizer. + User should not use this class directly, + but need to use one of it's implementation. + + Args: + learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or any subclass of ``_LRScheduler`` . + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. 
\ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \ + some derived class of ``GradientClipBase`` . There are three cliping strategies \ + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \ + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + Returns: + Base class for optimizer. + + Examples: + .. code-block:: python + + #Take the subclass adam as an example + #Optimizer + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + out.backward() + adam.step() + adam.clear_grad() + + """ + + @imperative_base.no_grad() + def __init__(self, + learning_rate, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + self._parameter_list = list( + parameters) if parameters is not None else None + self._name = name + if framework.in_dygraph_mode(): + if self._parameter_list is None: + raise AttributeError( + "parameters argument given to the Optimizer should not be None in dygraph mode." + ) + if weight_decay is not None: + for param in self._parameter_list: + if param.regularizer is not None: + logging.info( + "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " + "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" + % weight_decay.__str__()) + break + if not isinstance(learning_rate, (float, _LRScheduler)): + raise TypeError( + "learning rate should be float or _LRScheduler, got %s here" % + type(learning_rate)) + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + if isinstance(weight_decay, float): + from ..fluid.regularizer import L2Decay + self.regularization = L2Decay(weight_decay) + else: + self.regularization = weight_decay + self._grad_clip = grad_clip + self._learning_rate = learning_rate + # the learning rate type should be inferenced from loss + self._dtype = None + # each program should have a independent learning rate + # program -> tensor(learning_rate) + self._learning_rate_map = dict() + # Dictionary of accumulators. Some optimizer subclasses need to + # allocate and manage extra tensors associated with the parameters + # to train. These tensors are called accumulators. 
+        # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
+        self._accumulators = defaultdict(lambda: dict())
+        self.helper = None
+        self._opti_name_list = []
+        self._accumulators_holder = {}
+        self._param_device_map = dict()
+        self.clear_gradients = self.clear_grad
+
+    @framework.dygraph_only
+    def state_dict(self):
+        '''
+        Get state dict information from optimizer. It contains all the Tensors used by the optimizer. For the Adam optimizer, it contains beta1, beta2, momentum etc. If _LRScheduler has been used, global_step will be included in the state dict.
+        If the optimizer has never been called (the minimize function), the state_dict is empty.
+
+        Args:
+            None
+
+        Returns:
+            state_dict(dict) : dict containing all the Tensors used by the optimizer
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+
+                adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+                state_dict = adam.state_dict()
+
+        '''
+        state_dict = {}
+        for k, v in self._accumulators.items():
+            for para_name, var_tmp in v.items():
+                state_dict[var_tmp.name] = var_tmp
+        # global step if use lr decay
+        if isinstance(self._learning_rate, _LRScheduler):
+            state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
+        return state_dict
+
+    @framework.dygraph_only
+    def set_state_dict(self, state_dict):
+        '''
+        Load optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum etc. If _LRScheduler has been used, global_step will be changed.
+
+        Args:
+            state_dict(dict) : Dict containing all the Tensors needed by the optimizer
+        Returns:
+            None
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+
+                state_dict = emb.state_dict()
+                paddle.framework.save(state_dict, "paddle_dy")
+
+                adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000),
+                                             parameters=emb.parameters())
+                state_dict = adam.state_dict()
+                paddle.framework.save(state_dict, "paddle_dy")
+
+                para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
+
+                adam.set_state_dict(opti_state_dict)
+
+        '''
+
+        if isinstance(self._learning_rate, _LRScheduler):
+            self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
+
+        self._accumulators_holder = state_dict
+        for k, v in self._accumulators.items():
+            for para_name, var_tmp in v.items():
+                assert var_tmp.name in state_dict, \
+                    "optimizer Tensor {} not found".format( var_tmp.name )
+                var = var_tmp.value()
+                tensor = var.get_tensor()
+                model_np = np.array(tensor)
+
+                load_para = state_dict[var_tmp.name]
+
+                if isinstance(load_para, Variable):
+                    load_para_np = load_para.numpy()
+                elif isinstance(load_para, core.VarBase):
+                    load_para_np = load_para.numpy()
+                elif isinstance(load_para, np.ndarray):
+                    load_para_np = load_para
+                else:
+                    raise RuntimeError("State dict type {} not supported".format(
+                        str(type(load_para))))
+
+                assert model_np.shape == load_para_np.shape, \
+                    "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
+                        var_tmp.name, model_np.shape, load_para_np.shape)
+
+                assert model_np.dtype == load_para_np.dtype, \
+                    "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
+                        var_tmp.name, model_np.dtype, load_para_np.dtype)
+
+                tensor.set(load_para_np, framework._current_expected_place())
+
+    def get_opti_var_name_list(self):
+        return self._opti_name_list
+
+    def _create_global_learning_rate(self):
+        if isinstance(self._learning_rate, 
_LRScheduler): + lr_var = self._global_learning_rate() + # only create global lr_var once + if not isinstance(lr_var, framework.Variable): + lr_name = unique_name.generate('learning_rate') + self._learning_rate._var_name = lr_name + lr_var = self.helper.create_global_variable( + name=lr_name, + shape=[1], + persistable=True, + stop_gradient=True, + dtype=paddle.get_default_dtype() + if self._dtype is None else self._dtype) + main_prog = framework.default_main_program() + main_prog.lr_sheduler = self._learning_rate + main_prog.lr_var = lr_var + self._learning_rate_map[framework.default_main_program( + )] = lr_var + + lr_value = float(self._learning_rate()) + self.helper.set_variable_initializer( + lr_var, initializer=Constant(value=lr_value)) + elif isinstance(self._learning_rate, float): + # only create global lr_var once + lr = self._global_learning_rate() + if isinstance(lr, framework.Variable): + return + else: + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype=paddle.get_default_dtype() + if self._dtype is None else self._dtype, + persistable=True) + + @framework.dygraph_only + def set_lr(self, value): + """ + :api_attr: imperative + + Set the value of the learning rate manually in the optimizer. If the optimizer use _LRScheduler, + this API cannot be invoked, because it will lead to conflict. + + Args: + value (float|Tensor): the value of learning rate + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + # set learning rate manually by python float value + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.get_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 + + """ + if not isinstance(value, (int, float)): + raise TypeError( + "The type of 'value' in optimizer.set_lr must be float, but received %s." + % (type(value))) + if isinstance(self._learning_rate, _LRScheduler): + raise RuntimeError( + "optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict." + ) + self._learning_rate = float(value) + current_lr = self._global_learning_rate() + if current_lr is not None: + global_block = framework.default_main_program().global_block() + global_block.append_op( + type='fill_constant', + outputs={'Out': [current_lr]}, + attrs={ + 'dtype': current_lr.dtype, + 'shape': list(current_lr.shape), + 'value': float(value) + }, + stop_gradient=True) + + @framework.dygraph_only + def get_lr(self): + """ + :api_attr: imperative + + Get current step learning rate. The return value is all the same When _LRScheduler is not used, + otherwise return the current step learning rate. + + Returns: + float: The learning rate of the current step. + + Examples: + .. 
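# --- Hedged sketch (not part of this patch): set_lr() refuses to override a
# --- scheduler-driven learning rate, as documented above. PiecewiseLR is used here
# --- with the same positional arguments as in the get_lr example further below.
import paddle

paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.PiecewiseLR([2, 4], [0.1, 0.2, 0.3], 0)
adam = paddle.optimizer.Adam(scheduler, parameters=linear.parameters())
try:
    adam.set_lr(0.5)
except RuntimeError as e:
    print(e)   # the learning rate can't be set manually while a _LRScheduler is in use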
code-block:: python + + import numpy as np + import paddle + # example1: _LRScheduler is not used, return value is all the same + paddle.disable_static() + emb = paddle.nn.Embedding([10, 10]) + adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) + lr = adam.get_lr() + print(lr) # 0.001 + + # example2: PiecewiseLR is used, return the step learning rate + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.reduce_mean(out) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0) + adam = paddle.optimizer.Adam(scheduler, + parameters=linear.parameters()) + + # first step: learning rate is 0.2 + np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True + + # learning rate for different steps + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.step() + lr = adam.get_lr() + scheduler.step() + np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True + + """ + if isinstance(self._learning_rate, float): + return self._learning_rate + else: + return self._learning_rate() + + def _global_learning_rate(self, program=None): + """ + get global decayed learning rate + :return: + """ + if program is None: + program = framework.default_main_program() + return self._learning_rate_map.get(program, None) + + def _append_optimize_op(self, block, param_and_grad): + """ append optimize operator to block and return all the added optimize_op + """ + raise NotImplementedError( + "Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\"" + ) + + def _create_param_lr(self, param_and_grad): + # create learning rate tensor for every parameter + param = param_and_grad[0] + param_lr = param.optimize_attr['learning_rate'] + if type(param_lr) == Variable: + return param_lr + else: + if param_lr == 1.0: + return self._global_learning_rate() + else: + with default_main_program()._lr_schedule_guard( + is_with_opt=True), framework.name_scope( + 'scale_with_param_lr'): + return self._global_learning_rate() * param_lr + + def _create_accumulators(self, block, parameters): + """Create all accumulators needed by the parameters + + Args: + block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer + """ + pass + + def _finish_update(self, block, parameters_and_grads): + """Finish any custom updates needed + before completing an optimization step + + Args: + block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer + + Returns: + None + """ + pass + + def _add_accumulator(self, + name, + param, + dtype=None, + fill_value=0.0, + shape=None, + type=None, + device=None): + """Utility function to add an accumulator for a parameter + + Args: + block: the block in which the loss tensor is present + name: name of the accumulator + param: parameter tensor for which accumulator is to be added + dtype: data type of the accumulator tensor + fill_value: value to initialize the accumulator tensor + """ + if self._name is not None: + name = self._name + "_" + name + if (name in self._accumulators and + param.name in self._accumulators[name]): + if framework.in_dygraph_mode(): + return self._accumulators[name][param.name] + raise Exception("Accumulator {} already exists for parameter {}". 
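# --- Hedged sketch (not part of this patch): the effective learning rate built by
# --- _create_param_lr() is the global rate scaled by the per-parameter multiplier
# --- stored in optimize_attr['learning_rate'] (for example set via ParamAttr).
global_lr = 0.1            # value held by the global learning-rate variable
param_lr_multiplier = 0.5  # param.optimize_attr['learning_rate'] for this parameter
effective_lr = global_lr * param_lr_multiplier
print(effective_lr)        # 0.05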
+ format(name, param.name)) + if shape == None: + shape = param.shape + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_" + name + var_name = unique_name.generate(var_name) + self._opti_name_list.append(var_name) + + var = self.helper.create_global_variable( + name=var_name, + persistable=True, + dtype=dtype or param.dtype, + type=param.type if type is None else type, + shape=shape, + belong_to_optimizer=True) + if device is None: + device = self._get_device_for_param(param.name) + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value))) + + if framework.in_dygraph_mode(): + if len(self._accumulators_holder) > 0: + assert var_name in self._accumulators_holder, \ + "Optimizer set error, {} should in state dict".format( var_name ) + var.set_value(self._accumulators_holder[var_name]) + + self._accumulators[name][param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + + Args: + name: name of the accumulator + param: parameter tensor for which accumulator is to be fetched + + Returns: + accumulator tensor for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + if (name not in self._accumulators or + param.name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, param.name)) + return self._accumulators[name][param.name] + + def _update_param_device_map(self, parameters_and_grads, target_block): + for param_and_grad in parameters_and_grads: + if param_and_grad[0].trainable is True: + param_name = param_and_grad[0].name + ops = target_block.ops + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + ) + for op in ops: + input_arg_names = op.input_arg_names + if param_name in input_arg_names: + self._param_device_map[param_name] = op.attr( + device_attr_name) + break + + def _get_device_for_param(self, param_name): + device = None + if param_name in self._param_device_map: + device = self._param_device_map[param_name] + return device + + def _create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to tensors. + + Args: + parameters_and_grads(list(tuple(Tensor, Tensor))): + a list of (tensor, gradient) pair to update. + + Returns: + return_op_list: a list of operators that will complete one step of + optimization. This will include parameter update ops, global step + update ops and any other custom ops required by subclasses to manage + their internal state. + """ + # This is a default implementation of create_optimization_pass that + # can be shared by most optimizers. This implementation assumes that + # the subclass will implement the _append_optimize_op method and the + # _initialize_tensors method. The subclass can extend the + # _create_accumulators method if it needs to create accumulators + # for parameters and extend _finish_update method to add custom ops. + + # Allways called under program_guard use global block as loss block + # But if current block is in control flow, append optimize op in the + # grad block of current block + + global_block = framework.default_main_program().global_block() + target_block = global_block + current_block = framework.default_main_program().current_block() + if current_block.idx != global_block.idx: + assert current_block.backward_block_idx != -1, \ + "current block is not global_block, but it doesn't have backward block." 
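# --- Hedged sketch (hypothetical subclass, not part of this patch) of the
# --- accumulator protocol above: _create_accumulators() registers per-parameter
# --- state with _add_accumulator (zero-filled by default) and _append_optimize_op()
# --- fetches it back with _get_accumulator. `Optimizer` is the class defined above.
class SimpleMomentum(Optimizer):
    _velocity_acc_str = "velocity"

    def _create_accumulators(self, block, parameters):
        for p in parameters:
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        velocity = self._get_accumulator(self._velocity_acc_str, param_and_grad[0])
        # a real subclass would append its update operator here, e.g. via
        # block.append_op(type=..., inputs=..., outputs=..., attrs=...)
        raise NotImplementedError("illustrative sketch only")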
+ target_block = framework.default_main_program().blocks[ + current_block.backward_block_idx] + + start = len(target_block.ops) + self.helper = LayerHelper(self.__class__.__name__) + self._update_param_device_map(parameters_and_grads, target_block) + self._create_accumulators( + target_block, + [p[0] for p in parameters_and_grads if p[0].trainable]) + self._create_global_learning_rate() + + if framework.in_dygraph_mode(): + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + if param_and_grad[0].trainable is True: + self._append_optimize_op(target_block, param_and_grad) + else: + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: + device = self._get_device_for_param(param_and_grad[0] + .name) + with device_guard(device): + optimize_op = self._append_optimize_op( + target_block, param_and_grad) + + # Get custom finish ops for subclasses + # FIXME: Need to fix this once we figure out how to handle dependencies + self._finish_update(target_block, parameters_and_grads) + + end = len(target_block.ops) + return target_block._slice_ops(start, end) + + def _append_dgc_ops(self, param_and_grad): + pass + + def backward(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None, + callbacks=None): + """ + The first part of ``minimize``, do auto-diff to append backward operations for + the current program. + + Args: + loss (Tensor): ``loss`` tensor to run optimizations. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameters``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need + to be updated. The default value is None. + callbacks (list, optional): list of callable objects to run when appending backward + operator for one parameter. The default value is None. + + Return: + list: list of (param, grad) tensor pairs, param is ``Parameter``, + grad is the gradient value corresponding to the parameter. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. 
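# --- Hedged sketch (not part of this patch): in static mode minimize() drives
# --- backward() plus _create_optimization_pass(), and the returned op slice can be
# --- inspected on the program. It reuses the same fluid layers as the minimize
# --- example further below.
import paddle
import paddle.fluid as fluid

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.data(name='x', shape=[None, 13], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(pred)
    adam = paddle.optimizer.Adam(learning_rate=0.01)
    ops, params_grads = adam.minimize(loss, startup_program=startup)
    print([op.type for op in ops])   # optimize ops appended to the block, e.g. ['adam', 'adam']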
+ adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + """ + act_no_grad_set = None + if framework.in_dygraph_mode(): + pass + else: + act_no_grad_set = self._get_no_grad_set(loss, no_grad_set) + + self._dtype = loss.dtype + if framework.in_dygraph_mode(): + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + # create gradient tensor + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + else: + if callbacks is None: + callbacks = [error_clip_callback] + else: + assert (isinstance(callbacks, list)) + program = loss.block.program + assert len(loss.shape) == 1 and loss.shape[0] == 1, \ + "The loss.shape should be (1L,), but the current loss.shape is {}. " \ + "Maybe that you should call paddle.mean to process the current loss.".format( + loss.shape) + parameter_list = parameters if parameters \ + else self._parameter_list + with program_guard(program, startup_program): + params_grads = append_backward(loss, parameter_list, + act_no_grad_set, callbacks) + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. + self._append_dgc_ops(params_grads) + return params_grads + + def apply_gradients(self, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + + Args: + params_grads (list): list of (param, grad) pair to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + optimizer = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + params_grads = optimizer.backward(loss) + optimizer.apply_gradients(params_grads) + + """ + + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + # 'optimizer(grad_clip)' or 'set_gradient_clip' + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) + else: + + params_grads = append_gradient_clip_ops(params_grads) + + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) + + optimize_ops = self._create_optimization_pass(params_grads) + return optimize_ops + + def _apply_optimize(self, loss, startup_program, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + Args: + loss (Tensor): loss tensor to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameters`. + params_grads (list): list of (param, grad) pair to do optimization. + Returns: + list: A list of operators appended to the current program. 
+ """ + if framework.in_dygraph_mode(): + with program_guard(framework.default_main_program(), + framework.default_startup_program()): + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) + params_grads = append_regularization_ops(params_grads, + self.regularization) + optimize_ops = self._create_optimization_pass(params_grads) + else: + program = loss.block.program + with program_guard(program, startup_program): + optimize_ops = self.apply_gradients(params_grads) + return optimize_ops + + def _get_no_grad_set(self, loss, no_grad_set=None): + no_grad_set = _get_no_grad_set_name(no_grad_set) + parameters = loss.block.program.global_block().all_parameters() + param_no_trainable = set( + [param.name for param in parameters if param.trainable is False]) + # If the parameter is no trainable, it should not have a gradient. + no_grad_set.update(param_no_trainable) + + return no_grad_set + + @framework.dygraph_only + def clear_grad(self): + """ + Clear the gradients of all optimized parameters for model. + + Returns: + None + + Examples: + .. code-block:: python + + import numpy as np + import paddle + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + + """ + for p in self._parameter_list: + if p.trainable: + p.clear_gradient() + + @imperative_base.no_grad() + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + """ + Add operations to minimize ``loss`` by updating ``parameters``. + + Args: + loss (Tensor): A ``Tensor`` containing the value to minimize. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameters``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need + to be updated. The default value is None. + + Returns: + tuple: tuple (optimize_ops, params_grads), A list of operators appended + by minimize and a list of (param, grad) tensor pairs, param is + ``Parameter``, grad is the gradient value corresponding to the parameter. + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and + ``fetch_list`` before run, see details in ``Executor``. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam_optimizer = paddle.optimizer.Adam(0.01) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + """ + assert isinstance(loss, Variable), "The loss should be an Tensor." + + parameter_list = parameters if parameters \ + else self._parameter_list + params_grads = self.backward( + loss, + startup_program=startup_program, + parameters=parameter_list, + no_grad_set=no_grad_set) + + optimize_ops = self._apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + + return optimize_ops, params_grads + + @framework.dygraph_only + def step(self): + """ + Execute the optimizer once. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + """ + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py new file mode 100644 index 0000000000000000000000000000000000000000..0bc4c9bfd53dc15449f03d6de6c8942e977bf562 --- /dev/null +++ b/python/paddle/optimizer/rmsprop.py @@ -0,0 +1,207 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable + +__all__ = ["RMSProp"] + + +class RMSProp(Optimizer): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning + rate method. The original slides proposed RMSProp: Slide 29 of + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . 
+ + The original equation is as follows: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) + + The first equation calculates moving average of the squared gradient for + each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`. + + In some cases, adding a momentum term :math: `\\beta` is beneficial. + In our implementation, Nesterov momentum is used: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + if centered is True: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 + and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a + smoothing term to avoid division by zero, usually set somewhere in range + from 1e-4 to 1e-8. + + + Parameters: + learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. + rho(float): rho is :math: `\\rho` in equation, default is 0.95. + epsilon(float): :math: `\\epsilon` in equation is smoothing term to + avoid division by zero, default is 1e-6. + momentum(float): :math:`\\beta` in equation is the momentum term, + default is 0.0. + centered(bool): If True, gradients are normalized by the estimated variance of + the gradient; if False, by the uncentered second moment. Setting this to + True may help with training, but is slightly more expensive in terms of + computation and memory. Defaults to False. + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + + Raises: + ValueError: If learning_rate, rho, epsilon, momentum are None. + + Examples: + .. 
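# --- Hedged sketch (not part of this patch): a plain-NumPy transcription of the
# --- update equations above for a single step on one weight tensor. It is purely
# --- illustrative and is not the C++ rmsprop kernel invoked by this class.
import numpy as np

def rmsprop_step(w, g, r, g_avg, v, lr=0.01, rho=0.95, momentum=0.9,
                 eps=1e-6, centered=True):
    r = rho * r + (1.0 - rho) * g * g              # moving average of squared grads
    if centered:
        g_avg = rho * g_avg + (1.0 - rho) * g      # moving average of grads
        denom = np.sqrt(r - g_avg * g_avg + eps)
    else:
        denom = np.sqrt(r + eps)
    v = momentum * v + lr * g / denom              # momentum on the scaled gradient
    w = w - v
    return w, r, g_avg, v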
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.RMSProp(learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + + """ + + _momentum_acc_str = "momentum" + _mean_square_acc_str = "mean_square" + _mean_grad_acc_str = "mean_grad" + + def __init__(self, + learning_rate, + rho=0.95, + epsilon=1.0e-6, + momentum=0.0, + centered=False, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + if learning_rate is None: + raise ValueError("learning_rate is not set.") + if rho is None: + raise ValueError("rho is not set.") + if epsilon is None: + raise ValueError("epsilon is not set.") + if momentum is None: + raise ValueError("momentum is not set.") + + super(RMSProp, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + + self.type = "rmsprop" + self._rho = rho + self._epsilon = epsilon + self._momentum = momentum + self._centered = centered + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._momentum_acc_str, p) + self._add_accumulator(self._mean_square_acc_str, p) + self._add_accumulator(self._mean_grad_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + momentum_acc = self._get_accumulator(self._momentum_acc_str, + param_and_grad[0]) + mean_square_acc = self._get_accumulator(self._mean_square_acc_str, + param_and_grad[0]) + mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, + param_and_grad[0]) + rmsprop_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": momentum_acc, + "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": momentum_acc, + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc + }, + attrs={ + "epsilon": self._epsilon, + "decay": self._rho, + "momentum": self._momentum, + "centered": self._centered + }, + stop_gradient=True) + + return rmsprop_op diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index d31e5173f8b7d2bd1ab89267b11bdbbad9feb518..42a28a4f04e368cf8a1c1a144639bc743234a540 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -17,9 +17,13 @@ __all__ = [ 'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard', 'BuildStrategy', 'CompiledProgram', 'Print', 'py_func', 'ExecutionStrategy', 'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr', - 'default_main_program', 'default_startup_program', 'Program', 'save', 'load' + 'default_main_program', 'default_startup_program', 'Program', 'save', + 'load', 'data', 'InputSpec' ] +from . 
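# --- Hedged sketch (not part of this patch): using the class above with the
# --- momentum/centered options enabled; argument names follow the __init__
# --- signature defined in this file.
import paddle
import numpy as np

paddle.disable_static()
inp = paddle.to_tensor(np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
linear = paddle.nn.Linear(10, 10)
loss = paddle.mean(linear(inp))
rmsprop = paddle.optimizer.RMSProp(learning_rate=0.01,
                                   rho=0.95,
                                   momentum=0.9,
                                   centered=True,
                                   parameters=linear.parameters())
loss.backward()
rmsprop.step()
rmsprop.clear_grad()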
import nn +from .input import data #DEFINE_ALIAS +from .input import InputSpec #DEFINE_ALIAS from ..fluid.executor import Executor #DEFINE_ALIAS from ..fluid.executor import global_scope #DEFINE_ALIAS from ..fluid.executor import scope_guard #DEFINE_ALIAS diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py new file mode 100644 index 0000000000000000000000000000000000000000..eb70320ea7551de6e1117900e3769f000fdf23dd --- /dev/null +++ b/python/paddle/static/input.py @@ -0,0 +1,330 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six + +import paddle +from paddle.fluid import core, Variable +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.data_feeder import check_type +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +__all__ = ['data', 'InputSpec'] + + +def data(name, shape, dtype=None, lod_level=0): + """ + **Data Layer** + + This function creates a variable on the global block. The global variable + can be accessed by all the following operators in the graph. The variable + is a placeholder that could be fed with input, such as Executor can feed + input into the variable. When `dtype` is None, the dtype + will get from the global dtype by `paddle.get_default_dtype()`. + + Args: + name (str): The name/alias of the variable, see :ref:`api_guide_Name` + for more details. + shape (list|tuple): List|Tuple of integers declaring the shape. You can + set "None" or -1 at a dimension to indicate the dimension can be of any + size. For example, it is useful to set changeable batch size as "None" or -1. + dtype (np.dtype|str, optional): The type of the data. Supported + dtype: bool, float16, float32, float64, int8, int16, int32, int64, + uint8. Default: None. When `dtype` is not set, the dtype will get + from the global dtype by `paddle.get_default_dtype()`. + lod_level (int, optional): The LoD level of the LoDTensor. Usually users + don't have to set this value. For more details about when and how to + use LoD level, see :ref:`user_guide_lod_tensor` . Default: 0. + + Returns: + Variable: The global variable that gives access to the data. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + # Creates a variable with fixed size [3, 2, 1] + # User can only feed data of the same shape to x + # the dtype is not set, so it will set "float32" by + # paddle.get_default_dtype(). You can use paddle.get_default_dtype() to + # change the global dtype + x = paddle.static.data(name='x', shape=[3, 2, 1]) + + # Creates a variable with changeable batch size -1. 
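# --- Hedged sketch (not part of this patch): when dtype is None the placeholder
# --- follows the global default dtype, as described above. paddle.set_default_dtype
# --- is assumed to be the writable counterpart of paddle.get_default_dtype().
import paddle

paddle.set_default_dtype("float64")
x64 = paddle.static.data(name='x64', shape=[None, 8])   # dtype resolved to float64
print(x64.dtype)
paddle.set_default_dtype("float32")                      # restore the default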
+ # Users can feed data of any batch size into y, + # but size of each data sample has to be [2, 1] + y = paddle.static.data(name='y', shape=[-1, 2, 1], dtype='float32') + + z = x + y + + # In this example, we will feed x and y with np-ndarray "1" + # and fetch z, like implementing "1 + 1 = 2" in PaddlePaddle + feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32) + + exe = paddle.static.Executor(paddle.framework.CPUPlace()) + out = exe.run(paddle.static.default_main_program(), + feed={ + 'x': feed_data, + 'y': feed_data + }, + fetch_list=[z.name]) + + # np-ndarray of shape=[3, 2, 1], dtype=float32, whose elements are 2 + print(out) + + """ + helper = LayerHelper('data', **locals()) + check_type(name, 'name', (six.binary_type, six.text_type), 'data') + check_type(shape, 'shape', (list, tuple), 'data') + + shape = list(shape) + for i in six.moves.range(len(shape)): + if shape[i] is None: + shape[i] = -1 + + if dtype: + return helper.create_global_variable( + name=name, + shape=shape, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=True) + else: + return helper.create_global_variable( + name=name, + shape=shape, + dtype=paddle.get_default_dtype(), + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=True) + + +class InputSpec(object): + """ + InputSpec describes the signature information of the model input, such as ``shape`` , ``dtype`` , ``name`` . + + This interface is often used to specify input tensor information of models in high-level API. + It's also used to specify the tensor information for each input parameter of the forward function + decorated by `@paddle.jit.to_static`. + + Args: + shape (tuple(integers)|list[integers]): List|Tuple of integers + declaring the shape. You can set "None" or -1 at a dimension + to indicate the dimension can be of any size. For example, + it is useful to set changeable batch size as "None" or -1. + dtype (np.dtype|str, optional): The type of the data. Supported + dtype: bool, float16, float32, float64, int8, int16, int32, int64, + uint8. Default: float32. + name (str): The name/alias of the variable, see :ref:`api_guide_Name` + for more details. + + Examples: + .. code-block:: python + + from paddle.static import InputSpec + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + + print(input) # InputSpec(shape=(-1, 784), dtype=VarType.FP32, name=x) + print(label) # InputSpec(shape=(-1, 1), dtype=VarType.INT64, name=label) + """ + + def __init__(self, shape, dtype='float32', name=None): + # replace `None` in shape with -1 + self.shape = self._verify(shape) + # convert dtype into united represention + if dtype is not None: + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + self.dtype = dtype + self.name = name + + def _create_feed_layer(self): + return data(self.name, shape=self.shape, dtype=self.dtype) + + def __repr__(self): + return '{}(shape={}, dtype={}, name={})'.format( + type(self).__name__, self.shape, self.dtype, self.name) + + @classmethod + def from_tensor(cls, tensor, name=None): + """ + Generates a InputSpec based on the description of input tensor. + + Args: + tensor(Tensor): the source tensor to generate a InputSpec instance + + Returns: + A InputSpec instance generated from Tensor. + + Examples: + .. 
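# --- Hedged sketch (not part of this patch): _verify() normalises a shape before it
# --- is stored, so None entries become -1 exactly as in the class docstring above.
from paddle.static import InputSpec

spec = InputSpec(shape=[None, 3, None], dtype='float32', name='img')
print(spec.shape)   # (-1, 3, -1)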
code-block:: python + + import numpy as np + import paddle + from paddle.static import InputSpec + + paddle.disable_static() + + x = paddle.to_tensor(np.ones([2, 2], np.float32)) + x_spec = InputSpec.from_tensor(x, name='x') + print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) + + """ + if isinstance(tensor, (Variable, core.VarBase)): + return cls(tensor.shape, tensor.dtype, name or tensor.name) + else: + raise ValueError( + "Input `tensor` should be a Tensor, but received {}.".format( + type(tensor).__name__)) + + @classmethod + def from_numpy(cls, ndarray, name=None): + """ + Generates a InputSpec based on the description of input np.ndarray. + + Args: + tensor(Tensor): the source numpy ndarray to generate a InputSpec instance + + Returns: + A InputSpec instance generated from Tensor. + + Examples: + .. code-block:: python + + import numpy as np + from paddle.static import InputSpec + + x = np.ones([2, 2], np.float32) + x_spec = InputSpec.from_numpy(x, name='x') + print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) + + """ + return cls(ndarray.shape, ndarray.dtype, name) + + def batch(self, batch_size): + """ + Inserts `batch_size` in front of the `shape`. + + Args: + batch_size(int): the inserted integer value of batch size. + + Returns: + The original InputSpec instance by inserting `batch_size` in front of `shape`. + + Examples: + .. code-block:: python + + from paddle.static import InputSpec + + x_spec = InputSpec(shape=[64], dtype='float32', name='x') + x_spec.batch(4) + print(x_spec) # InputSpec(shape=(4, 64), dtype=VarType.FP32, name=x) + + """ + if isinstance(batch_size, (list, tuple)): + if len(batch_size) != 1: + raise ValueError( + "Length of batch_size: {} shall be 1, but received {}.". + format(batch_size, len(batch_size))) + batch_size = batch_size[1] + elif not isinstance(batch_size, six.integer_types): + raise TypeError("type(batch_size) shall be `int`, but received {}.". + format(type(batch_size).__name__)) + + new_shape = [batch_size] + list(self.shape) + self.shape = tuple(new_shape) + + return self + + def unbatch(self): + """ + Removes the first element of `shape`. + + Returns: + The original InputSpec instance by removing the first element of `shape` . + + Examples: + .. code-block:: python + + from paddle.static import InputSpec + + x_spec = InputSpec(shape=[4, 64], dtype='float32', name='x') + x_spec.unbatch() + print(x_spec) # InputSpec(shape=(64,), dtype=VarType.FP32, name=x) + + """ + if len(self.shape) == 0: + raise ValueError( + "Not support to unbatch a InputSpec when len(shape) == 0.") + + self.shape = self._verify(self.shape[1:]) + return self + + def _verify(self, shape): + """ + Verifies the input shape and modifies `None` into `-1`. + """ + if not isinstance(shape, (list, tuple)): + raise TypeError( + "Type of `shape` in InputSpec should be one of (tuple, list), but received {}.". + format(type(shape).__name__)) + if len(shape) == 0: + raise ValueError( + "`shape` in InputSpec should contain at least 1 element, but received {}.". + format(shape)) + + for i, ele in enumerate(shape): + if ele is not None: + if not isinstance(ele, six.integer_types): + raise ValueError( + "shape[{}] should be an `int`, but received `{}`:{}.". + format(i, type(ele).__name__, ele)) + if ele is None or ele < -1: + shape[i] = -1 + + return tuple(shape) + + def __hash__(self): + # Note(Aurelius84): `name` is not considered as a field to compute hashkey. 
+ # Because it's no need to generate a new program in following cases while using + # @paddle.jit.to_static. + # + # Case 1: + # foo(x_var) + # foo(y_var) + # x_var and y_var hold same shape and dtype, they should share a same program. + # + # + # Case 2: + # foo(x_var) + # foo(x_np) # x_np is a numpy.ndarray. + # x_var and x_np hold same shape and dtype, they should also share a same program. + return hash((tuple(self.shape), self.dtype)) + + def __eq__(self, other): + slots = ['shape', 'dtype', 'name'] + return (type(self) is type(other) and all( + getattr(self, attr) == getattr(other, attr) for attr in slots)) + + def __ne__(self, other): + return not self == other diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py old mode 100644 new mode 100755 index 77d821d56b8e1d7b6953dcd9a23a5e8a8b3b55ff..0fed32a1676759bd94961af0a8949d035ec48c8f --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -58,7 +58,7 @@ from .logic import equal #DEFINE_ALIAS from .logic import greater_equal #DEFINE_ALIAS from .logic import greater_than #DEFINE_ALIAS from .logic import is_empty #DEFINE_ALIAS -from .logic import isfinite #DEFINE_ALIAS +#from .logic import isfinite #DEFINE_ALIAS from .logic import less_equal #DEFINE_ALIAS from .logic import less_than #DEFINE_ALIAS from .logic import logical_and #DEFINE_ALIAS @@ -111,6 +111,7 @@ from .math import cumsum #DEFINE_ALIAS from .math import elementwise_add #DEFINE_ALIAS from .math import elementwise_div #DEFINE_ALIAS from .math import elementwise_floordiv #DEFINE_ALIAS +from .math import elementwise_mul #DEFINE_ALIAS from .math import elementwise_mod #DEFINE_ALIAS from .math import elementwise_pow #DEFINE_ALIAS from .math import elementwise_sub #DEFINE_ALIAS @@ -143,7 +144,11 @@ from .math import maximum #DEFINE_ALIAS from .math import min #DEFINE_ALIAS from .math import minimum #DEFINE_ALIAS from .math import mm #DEFINE_ALIAS -from .math import div #DEFINE_ALIAS +from .math import divide #DEFINE_ALIAS +from .math import floor_divide #DEFINE_ALIAS +from .math import remainder #DEFINE_ALIAS +from .math import mod #DEFINE_ALIAS +from .math import floor_mod #DEFINE_ALIAS from .math import multiply #DEFINE_ALIAS from .math import add #DEFINE_ALIAS from .math import atan #DEFINE_ALIAS @@ -153,11 +158,16 @@ from .math import log1p #DEFINE_ALIAS from .math import erf #DEFINE_ALIAS from .math import addcmul #DEFINE_ALIAS from .math import addmm #DEFINE_ALIAS -from .math import clamp #DEFINE_ALIAS +from .math import clip #DEFINE_ALIAS from .math import trace #DEFINE_ALIAS from .math import kron #DEFINE_ALIAS -# from .random import gaussin #DEFINE_ALIAS -# from .random import uniform #DEFINE_ALIAS +from .math import isfinite #DEFINE_ALIAS +from .math import isinf #DEFINE_ALIAS +from .math import isnan #DEFINE_ALIAS +from .math import prod #DEFINE_ALIAS +from .random import standard_normal +from .random import normal +from .random import uniform #DEFINE_ALIAS from .random import shuffle #DEFINE_ALIAS from .random import randn #DEFINE_ALIAS from .random import rand #DEFINE_ALIAS @@ -175,9 +185,12 @@ from .search import index_select #DEFINE_ALIAS from .search import nonzero #DEFINE_ALIAS from .search import sort #DEFINE_ALIAS from .search import index_sample #DEFINE_ALIAS +from .search import masked_select #DEFINE_ALIAS from .stat import mean #DEFINE_ALIAS from .stat import reduce_mean #DEFINE_ALIAS from .stat import std #DEFINE_ALIAS from .stat import var #DEFINE_ALIAS +from .stat import numel #DEFINE_ALIAS +# from 
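# --- Hedged sketch (not part of this patch) of the note above: `name` is excluded
# --- from __hash__ so specs with identical shape/dtype can share a cached program,
# --- while __eq__ still compares it.
from paddle.static import InputSpec

a = InputSpec([None, 784], 'float32', 'x')
b = InputSpec([None, 784], 'float32', 'y')
print(hash(a) == hash(b))   # True  - name is ignored by the hash
print(a == b)               # False - name participates in __eq__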
.tensor import Tensor #DEFINE_ALIAS # from .tensor import LoDTensor #DEFINE_ALIAS # from .tensor import LoDTensorArray #DEFINE_ALIAS diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 55bf1344014c3de963a001e95c940100f49a8300..9ef66712540aa54eac39b7e6160c5c91b6e3fcd5 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -29,7 +29,6 @@ from paddle.common_ops_import import * # TODO: define functions to get create a tensor from ..fluid.layers import crop_tensor #DEFINE_ALIAS -from ..fluid.layers import diag #DEFINE_ALIAS from ..fluid.layers import fill_constant #DEFINE_ALIAS from ..fluid.layers import linspace #DEFINE_ALIAS import paddle @@ -72,22 +71,22 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Args: data(scalar|tuple|list|ndarray|Tensor|ComplexTensor): Initial data for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor. - dtype(str, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And - 'complex64' , 'complex128' only for ComplexTensor. - Default: None, infers data type from ``data`` . + 'complex64' , 'complex128' only for ComplexTensor. Default: None, for float point number, + get type from ``get_default_type``, for other type, infers from ``data`` . place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: - Tensor: A Tensor or ComplexTensor constructed from ``data``. + Tensor: A Tensor or ComplexTensor constructed from ``data`` . Raises: TypeError: If the data type of ``data`` is not scalar, list, tuple, numpy.ndarray, paddle.Tensor, paddle.ComplexTensor ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]] TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.Place, paddle.CUDAPinnedPlace, paddle.CUDAPlace + ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace Examples: @@ -95,7 +94,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): import paddle import numpy as np - paddle.enable_imperative() + paddle.disable_static() type(paddle.to_tensor(1)) # @@ -133,7 +132,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): # - dtype: double # - data: [0.1 0.2 0.3 0.4] - type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]]), , dtype='complex64') + type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]]), dtype='complex64') # paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64') @@ -190,12 +189,13 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor". 
format(type(data))) - if dtype: - dtype = convert_dtype(dtype) - if dtype != data.dtype: - data = data.astype(dtype) - if not np.iscomplexobj(data): + if dtype: + dtype = convert_dtype(dtype) + elif data.dtype in ['float16', 'float32', 'float64']: + dtype = paddle.framework.get_default_dtype() + if dtype and dtype != data.dtype: + data = data.astype(dtype) return paddle.Tensor( value=data, place=place, @@ -203,6 +203,14 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): zero_copy=True, stop_gradient=stop_gradient) else: + if dtype: + dtype = convert_dtype(dtype) + else: + dtype = paddle.framework.get_default_dtype() + dtype = 'complex64' if dtype in ['float16', 'float32' + ] else 'complex128' + if dtype != data.dtype: + data = data.astype(dtype) name = unique_name.generate('generated_tensor') real_tensor = paddle.Tensor( value=data.real, @@ -221,8 +229,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): def full_like(x, fill_value, dtype=None, name=None): """ - :alias_main: paddle.full_like - :alias: paddle.tensor.full_like, paddle.tensor.creation.full_like This function creates a tensor filled with ``fill_value`` which has identical shape of ``x`` and ``dtype``. If the ``dtype`` is None, the data type of Tensor is same with ``x``. @@ -230,7 +236,7 @@ def full_like(x, fill_value, dtype=None, name=None): Args: x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64. fill_value(bool|float|int): The value to fill the tensor with. Note: this value shouldn't exceed the range of the output data type. - dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output. The data type can be one + dtype(np.dtype|str, optional): The data type of output. The data type can be one of bool, float16, float32, float64, int32, int64. The default value is None, which means the output data type is the same as input. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` @@ -285,14 +291,12 @@ def full_like(x, fill_value, dtype=None, name=None): def ones(shape, dtype=None, name=None): """ - :alias_main: paddle.ones - :alias: paddle.tensor.ones, paddle.tensor.creation.ones The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 1. Args: shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape is int32 or int64. - dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output Tensor, it supports + dtype(np.dtype|str, optional): Data type of output Tensor, it supports bool, float16, float32, float64, int32 and int64. Default: if None, the data type is 'float32'. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` @@ -361,14 +365,14 @@ def ones_like(x, dtype=None, name=None): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) - out1 = paddle.zeros_like(x) # [1., 1., 1.] - out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1] + x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + out1 = paddle.zeros_like(x) # [1., 1., 1.] 
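# --- Hedged sketch (not part of this patch): the default-dtype rule implemented
# --- above, i.e. floating-point input is cast to paddle.get_default_dtype() while
# --- integer input keeps its own dtype (and complex input maps to complex64/128).
import paddle
import numpy as np

paddle.disable_static()
print(paddle.get_default_dtype())                          # float32
t_float = paddle.to_tensor(np.array([0.1, 0.2], dtype='float64'))
print(t_float.dtype)                                       # cast to the default: float32
t_int = paddle.to_tensor(np.array([1, 2], dtype='int64'))
print(t_int.dtype)                                         # int64, inferred from data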
+ out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1] """ return full_like(x=x, fill_value=1, dtype=dtype, name=name) @@ -376,14 +380,11 @@ def ones_like(x, dtype=None, name=None): def zeros(shape, dtype=None, name=None): """ - :alias_main: paddle.zeros - :alias: paddle.tensor.zeros, paddle.tensor.creation.zeros - The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 0. Args: shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of ``shape`` is int32 or int64. - dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output Tensor, it supports + dtype(np.dtype|str, optional): Data type of output Tensor, it supports bool, float16, float32, float64, int32 and int64. Default: if None, the date type is float32. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -451,14 +452,14 @@ def zeros_like(x, dtype=None, name=None): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle + import numpy as np - paddle.disable_static() + paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) - out1 = paddle.zeros_like(x) # [0., 0., 0.] - out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0] + x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + out1 = paddle.zeros_like(x) # [0., 0., 0.] + out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0] """ return full_like(x=x, fill_value=0, dtype=dtype, name=name) @@ -466,8 +467,6 @@ def zeros_like(x, dtype=None, name=None): def eye(num_rows, num_columns=None, dtype=None, name=None): """ - :alias_main: paddle.eye - :alias: paddle.tensor.eye, paddle.tensor.creation.eye This function constructs 2-D Tensor with ones on the diagonal and zeros elsewhere. @@ -475,7 +474,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): num_rows(int): the number of rows in each batch Tensor. num_columns(int, optional): the number of columns in each batch Tensor. If None, default: num_rows. - dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned Tensor. + dtype(np.dtype|str, optional): The data type of the returned Tensor. It should be int32, int64, float16, float32, float64. Default: if None, the data type is float32. name(str, optional): The default value is None. Normally there is no need for @@ -516,8 +515,6 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): def full(shape, fill_value, dtype=None, name=None): """ - :alias_main: paddle.full - :alias: paddle.tensor.full, paddle.tensor.creation.full This Op return a Tensor with the ``fill_value`` which size is same as ``shape``. @@ -528,7 +525,7 @@ def full(shape, fill_value, dtype=None, name=None): If ``shape`` is an Tensor, it should be an 1-D Tensor . fill_value(bool|float|int|Tensor): The constant value used to initialize the Tensor to be created. If ``fill_value`` is an Tensor, it must be an 1-D Tensor. - dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output Tensor + dtype(np.dtype|str, optional): Data type of the output Tensor which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data type of created Tensor is `float32` name(str, optional): The default value is None. 
Normally there is no need for user to set this @@ -914,3 +911,99 @@ def meshgrid(*args, **kwargs): type='meshgrid', inputs={'X': list(args)}, outputs={'Out': out}) return out + + +def diag(x, offset=0, padding_value=0, name=None): + """ + If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. + + If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned. + + The argument ``offset`` controls the diagonal offset: + + If ``offset`` = 0, it is the main diagonal. + + If ``offset`` > 0, it is superdiagonal. + + If ``offset`` < 0, it is subdiagonal. + + Args: + x (Tensor): The input tensor. Its shape is either 1-D or 2-D. Its data type should be float32, float64, int32, int64. + offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal. + padding_value (int|float, optional): Use this value to fill the area outside the specified diagonal band. Only takes effect when the input is a 1-D Tensor. The default value is 0. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, a square matrix or a vector. The output data type is the same as input data type. + + Examples: + .. code-block:: python + + import paddle + + paddle.disable_static() + x = paddle.to_tensor([1, 2, 3]) + y = paddle.diag(x) + print(y.numpy()) + # [[1 0 0] + # [0 2 0] + # [0 0 3]] + + y = paddle.diag(x, offset=1) + print(y.numpy()) + # [[0 1 0 0] + # [0 0 2 0] + # [0 0 0 3] + # [0 0 0 0]] + + y = paddle.diag(x, padding_value=6) + print(y.numpy()) + # [[1 6 6] + # [6 2 6] + # [6 6 3]] + + .. code-block:: python + + import paddle + + paddle.disable_static() + x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + y = paddle.diag(x) + print(y.numpy()) + # [1 5] + + y = paddle.diag(x, offset=1) + print(y.numpy()) + # [2 6] + + y = paddle.diag(x, offset=-1) + print(y.numpy()) + # [4] + """ + if in_dygraph_mode(): + return core.ops.diag_v2(x, "offset", offset, "padding_value", + padding_value) + + check_type(x, 'x', (Variable), 'diag_v2') + check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], + 'diag_v2') + check_type(offset, 'offset', (int), 'diag_v2') + check_type(padding_value, 'padding_value', (int, float), 'diag_v2') + if len(x.shape) != 1 and len(x.shape) != 2: + raise ValueError( + "The dimension of input x must be either 1 or 2, but received {}". + format(len(x.shape))) + + helper = LayerHelper("diag_v2", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='diag_v2', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'offset': offset, + 'padding_value': padding_value}) + + out.stop_gradient = True + return out diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 972c9fbce4d2ab11b4a0bbc4f9818f721486c741..b5b528325cd9f52a8b61ef21df0095c41da5a8ed 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import numpy as np from paddle.common_ops_import import * from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type @@ -35,135 +36,134 @@ __all__ = [ ] -def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): +def matmul(x, y, transpose_x=False, transpose_y=False, name=None): """ - :alias_main: paddle.matmul - :alias: paddle.matmul,paddle.tensor.matmul,paddle.tensor.linalg.matmul + Applies matrix multiplication to two tensors. `matmul` follows + the complete broadcast rules, + and its behavior is consistent with `np.matmul`. - Applies matrix multiplication to two tensors. - - Currently, the input tensors' rank can be any, but when the rank of any - inputs is bigger than 3, this two inputs' rank should be equal. + Currently, the input tensors' number of dimensions can be any, `matmul` can be used to + achieve the `dot`, `matmul` and `batchmatmul`. The actual behavior depends on the shapes of :math:`x`, :math:`y` and the flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically: - If a transpose flag is specified, the last two dimensions of the tensor - are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for - :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as - :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the - opposite: It is treated as :math:`[D, 1]` in nontransposed form and as - :math:`[1, D]` in transposed form. - - - After transpose, the two tensors are 2-D or n-D and matrix multiplication - performs in the following way. - - - If both are 2-D, they are multiplied like conventional matrices. - - If either is n-D, it is treated as a stack of matrices residing in the - last two dimensions and a batched matrix multiply supporting broadcast - applies on the two tensors. - - Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and - nontransposed, the prepended or appended dimension :math:`1` will be - removed after matrix multiplication. + are transposed. If the tensor is ndim-1 of shape, the transpose is invalid. If the tensor + is ndim-1 of shape :math:`[D]`, then for :math:`x` it is treated as :math:`[1, D]`, whereas + for :math:`y` it is the opposite: It is treated as :math:`[D, 1]`. + + The multiplication behavior depends on the dimensions of `x` and `y`. Specifically: + + - If both tensors are 1-dimensional, the dot product result is obtained. + + - If both tensors are 2-dimensional, the matrix-matrix product is obtained. + + - If the `x` is 1-dimensional and the `y` is 2-dimensional, + a `1` is prepended to its dimension in order to conduct the matrix multiply. + After the matrix multiply, the prepended dimension is removed. + + - If the `x` is 2-dimensional and `y` is 1-dimensional, + the matrix-vector product is obtained. + + - If both arguments are at least 1-dimensional and at least one argument + is N-dimensional (where N > 2), then a batched matrix multiply is obtained. + If the first argument is 1-dimensional, a 1 is prepended to its dimension + in order to conduct the batched matrix multiply and removed after. + If the second argument is 1-dimensional, a 1 is appended to its + dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (exclude the last two dimensions) dimensions are + broadcasted according the broadcast rule. + For example, if input is a (j, 1, n, m) tensor and the other is a (k, m, p) tensor, + out will be a (j, k, n, p) tensor. 
Args: - x (Variable): The input variable which is a Tensor or LoDTensor. - y (Variable): The input variable which is a Tensor or LoDTensor. + x (Tensor): The input tensor which is a Tensor. + y (Tensor): The input tensor which is a Tensor. transpose_x (bool): Whether to transpose :math:`x` before multiplication. transpose_y (bool): Whether to transpose :math:`y` before multiplication. - alpha (float): The scale of output. Default 1.0. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. Returns: - Variable: The product Tensor (or LoDTensor) variable. + Tensor: The output Tensor. Examples: - .. code-block:: python - - # Examples to clarify shapes of the inputs and output - # x: [B, ..., M, K], y: [B, ..., K, N] - # paddle.matmul(x, y) # out: [B, ..., M, N] - - # x: [B, M, K], y: [B, K, N] - # paddle.matmul(x, y) # out: [B, M, N] - # x: [B, M, K], y: [K, N] - # paddle.matmul(x, y) # out: [B, M, N] - - # x: [M, K], y: [K, N] - # paddle.matmul(x, y) # out: [M, N] - - # x: [B, M, K], y: [K] - # paddle.matmul(x, y) # out: [B, M] + .. code-block:: python - # x: [K], y: [K] - # paddle.matmul(x, y) # out: [1] + import paddle + import numpy as np - # x: [M], y: [N] - # paddle.matmul(x, y, True, True) # out: [M, N] + paddle.disable_static() + # vector * vector + x_data = np.random.random([10]).astype(np.float32) + y_data = np.random.random([10]).astype(np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + z = paddle.matmul(x, y) + print(z.numpy().shape) + # [1] + + # matrix * vector + x_data = np.random.random([10, 5]).astype(np.float32) + y_data = np.random.random([5]).astype(np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + z = paddle.matmul(x, y) + print(z.numpy().shape) + # [10] + + # batched matrix * broadcasted vector + x_data = np.random.random([10, 5, 2]).astype(np.float32) + y_data = np.random.random([2]).astype(np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + z = paddle.matmul(x, y) + print(z.numpy().shape) + # [10, 5] + + # batched matrix * batched matrix + x_data = np.random.random([10, 5, 2]).astype(np.float32) + y_data = np.random.random([10, 2, 5]).astype(np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + z = paddle.matmul(x, y) + print(z.numpy().shape) + # [10, 5, 5] + + # batched matrix * broadcasted matrix + x_data = np.random.random([10, 1, 5, 2]).astype(np.float32) + y_data = np.random.random([1, 3, 2, 5]).astype(np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + z = paddle.matmul(x, y) + print(z.numpy().shape) + # [10, 3, 5, 5] - import paddle - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[2, 3], dtype='float32') - y = fluid.data(name='y', shape=[3, 2], dtype='float32') - out = paddle.matmul(x, y, True, True) """ + op_type = 'matmul_v2' + if in_dygraph_mode(): + op = getattr(core.ops, op_type) + return op(x, y, 'trans_x', transpose_x, 'trans_y', transpose_y) + attrs = { - 'transpose_X': transpose_x, - 'transpose_Y': transpose_y, - 'alpha': float(alpha), + 'trans_x': transpose_x, + 'trans_y': transpose_y, } - if in_dygraph_mode(): - out = _varbase_creator(dtype=x.dtype) - core.ops.matmul(x, y, out, 'transpose_X', transpose_x, 'transpose_Y', - transpose_y, 'alpha', float(alpha)) - return out - def __check_input(x, y): var_names = {'x': x, 'y': y} for name, val in var_names.items(): - check_variable_and_dtype( - val, name, ['float16', 'float32', 'float64'], 'matmul') - x_shape = list(x.shape) - 
y_shape = list(y.shape) - if len(x_shape) == 1: - x_shape = [1] + x_shape - if len(y_shape) == 1: - y_shape = y_shape + [1] - - # check the inner 2 dimensions - if transpose_x: - x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2] - if transpose_y: - y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] - if x_shape[-1] != y_shape[-2]: - assert (x_shape[-1] == -1) or (y_shape[-2] == -1), \ - "After performing an optional transpose, Input X's width should be " \ - "equal to Y's width for multiplication " \ - "prerequisites. But received X's shape: %s, Y's shape: %s\n" % \ - (x_shape, y_shape) - - if len(y_shape) > 2 and len(x_shape) > 2: - for i, dim_x in enumerate(x_shape[:-2]): - # don't check neg shape - if dim_x < 0 or y_shape[i] < 0: - continue - if dim_x != y_shape[i]: - raise ValueError( - "When the matrix is larger than 2 dimensions, the higher " - "dimensional values of the two matrices need to be equal. " - "But received x_shape[%d] != y_shape[%d]. X's shape: %s, " - "Y's shape: %s.\n" % (i, i, x_shape, y_shape)) + check_variable_and_dtype(val, name, ['float32', 'float64'], + 'matmul') __check_input(x, y) - helper = LayerHelper('matmul', **locals()) + helper = LayerHelper('matmul_v2', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( - type='matmul', + type='matmul_v2', inputs={'X': x, 'Y': y}, outputs={'Out': out}, @@ -171,7 +171,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): return out -def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): +def norm(x, p='fro', axis=None, keepdim=False, name=None): """ :alias_main: paddle.norm :alias: paddle.norm,paddle.tensor.norm,paddle.tensor.linalg.norm @@ -180,20 +180,19 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): or 2-norm, and in general the p-norm for p > 0) of a given tensor. Args: - input (Variable): The input tensor could be N-D tensor, and the input data + x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64. - p (float|string, optional): Order of the norm. Supported values are `fro`, `1`, `2`, - and any positive real number yielding the corresponding p-norm. - axis (int|list, optional): The axis on which to apply norm operation. If axis is int - or list with only one element, the vector norm is computed over the axis. - If axis is a list with two elements, the matrix norm is computed over the axis. + p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`, + `inf`,`-inf` and any positive real number yielding the corresponding p-norm. + Not supported: ord < 0, nuclear norm. + axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int + or list(int)/tuple(int) with only one element, the vector norm is computed over the axis. If `axis < 0`, the dimension to norm operation is rank(input) + axis. + If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis. keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have fewer dimension than the :attr:`input` unless :attr:`keepdim` is true, default value is False. - out (Variable, optional): The output tensor, default value is None. It's data type - must be the same as the input Tensor. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
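One way to review the expanded ``p``/``axis`` combinations documented above is to compare against ``np.linalg.norm`` for the cases both libraries support. The snippet below is an illustrative check only, not part of the patch; it uses the same 2x3x4 input as the examples in the hunk that follows.

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()

    data = (np.arange(24, dtype='float32') - 12).reshape(2, 3, 4)
    x = paddle.to_tensor(data)

    # vector 2-norm along the last axis
    print(paddle.norm(x, p=2, axis=-1).numpy())
    print(np.linalg.norm(data, ord=2, axis=-1))          # same values

    # Frobenius norm over axes [0, 1]
    print(paddle.norm(x, p='fro', axis=[0, 1]).numpy())
    print(np.linalg.norm(data, ord='fro', axis=(0, 1)))  # same values

    # inf-norm along axis 0
    print(paddle.norm(x, p=np.inf, axis=0).numpy())
    print(np.linalg.norm(data, ord=np.inf, axis=0))      # same values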
@@ -209,29 +208,57 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): .. code-block:: python import paddle - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[2, 3, 5], dtype='float64') - + import numpy as np + paddle.disable_static() + shape=[2, 3, 4] + np_input = np.arange(24).astype('float32') - 12 + np_input = np_input.reshape(shape) + x = paddle.to_tensor(np_input) + #[[[-12. -11. -10. -9.] [ -8. -7. -6. -5.] [ -4. -3. -2. -1.]] + # [[ 0. 1. 2. 3.] [ 4. 5. 6. 7.] [ 8. 9. 10. 11.]]] + # compute frobenius norm along last two dimensions. - out_fro = paddle.norm(x, p='fro', axis=[1,2]) - + out_fro = paddle.norm(x, p='fro', axis=[0,1]) + # out_fro.numpy() [17.435596 16.911535 16.7332 16.911535] + # compute 2-order vector norm along last dimension. out_pnorm = paddle.norm(x, p=2, axis=-1) + #out_pnorm.numpy(): [[21.118711 13.190906 5.477226] + # [ 3.7416575 11.224972 19.131126]] + + # compute 2-order norm along [0,1] dimension. + out_pnorm = paddle.norm(x, p=2, axis=[0,1]) + #out_pnorm.numpy(): [17.435596 16.911535 16.7332 16.911535] + + # compute inf-order norm + out_pnorm = paddle.norm(x, p=np.inf) + #out_pnorm.numpy() = [12.] + out_pnorm = paddle.norm(x, p=np.inf, axis=0) + #out_pnorm.numpy(): [[12. 11. 10. 9.] [8. 7. 6. 7.] [8. 9. 10. 11.]] + + # compute -inf-order norm + out_pnorm = paddle.norm(x, p=-np.inf) + #out_pnorm.numpy(): [0.] + out_pnorm = paddle.norm(x, p=-np.inf, axis=0) + #out_pnorm.numpy(): [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 1.]] """ - def frobenius_norm(input, dim=None, keepdim=False, out=None, name=None): + def frobenius_norm(input, dim=None, keepdim=False, name=None): """ The frobenius norm OP is to calculate the frobenius norm of certain two dimensions of Tensor `input`. Args: input (Variable): Tensor, data type float32, float64. dim (list, optional): None for last two dimensions. keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False. - out (Variable, optional): The tensor variable storing the output. """ if dim is not None and not (isinstance(dim, list) and len(dim) == 2): raise ValueError( "The dim of frobenius norm op should be None or two elements list!" ) + if in_dygraph_mode(): + if dim is None: dim = [-1] + return core.ops.frobenius_norm(input, 'dim', dim, 'keepdim', + keepdim) attrs = { 'dim': dim if dim != None else [-2, -1], 'keep_dim': keepdim, @@ -243,16 +270,8 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): 'frobenius_norm') helper = LayerHelper('frobenius_norm', **locals()) - if out is None: - out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - else: - check_type(out, 'out', (Variable), 'frobenius_norm') - check_dtype( - out.dtype, out.name, - convert_dtype(input.dtype), 'frobenius_norm', - '(The out data type in frobenius_norm must be the same with input data type.)' - ) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) helper.append_op( type='frobenius_norm', @@ -265,7 +284,7 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): porder=None, axis=None, keepdim=False, - out=None, + asvector=False, name=None): """ Calculate the p-order vector norm for certain dimension of Tensor `input`. @@ -274,32 +293,28 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): porder (float, optional): None for porder=2.0. axis (int, optional): None for last dimension. keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False. 
- out (Variable, optional): The tensor variable storing the output. """ + if in_dygraph_mode(): + if axis is None: axis = -1 + return core.ops.p_norm(input, 'porder', porder, 'axis', axis, + 'keepdim', keepdim, 'asvector', asvector) if porder is not None: check_type(porder, 'porder', (float, int), 'p_norm') if axis is not None: check_type(axis, 'axis', (int), 'p_norm') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'p_norm') + attrs = { 'axis': axis if axis is not None else -1, 'porder': float(porder) if porder is not None else 2.0, 'keepdim': keepdim, + 'asvector': asvector, 'epsilon': 1e-12, } - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'p_norm') - helper = LayerHelper('p_norm', **locals()) - if out is None: - out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - else: - check_type(out, 'out', (Variable), 'p_norm') - check_dtype( - out.dtype, out.name, - convert_dtype(input.dtype), 'p_norm', - '(The out data type in p_norm must be the same with input data type.)' - ) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) helper.append_op( type='p_norm', @@ -308,21 +323,126 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): attrs=attrs) return out + def inf_norm(input, + porder=None, + axis=axis, + keepdim=False, + asvector=False, + name=None): + helper = LayerHelper('frobenius_norm', **locals()) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op(type='abs', inputs={'X': input}, outputs={'Out': out}) + reduce_out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + + reduce_all = True if axis == None or axis == [] or asvector == True else False + axis = axis if axis != None and axis != [] else [0] + + reduce_type = 'reduce_max' if porder == np.float( + 'inf') else 'reduce_min' + helper.append_op( + type=reduce_type, + inputs={'X': out}, + outputs={'Out': reduce_out}, + attrs={'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': reduce_all}) + + return reduce_out + + def p0_matrix_norm(input, porder=0., axis=axis, keepdim=False, name=None): + block = LayerHelper('norm', **locals()) + out = block.create_variable_for_type_inference( + dtype=block.input_dtype()) + + cast_out = block.create_variable_for_type_inference(dtype=bool) + block.append_op( + type='cast', + inputs={'X': input}, + outputs={'Out': cast_out}, + attrs={ + 'in_dtype': input.dtype, + 'out_dtype': int(core.VarDesc.VarType.BOOL) + }) + cast_out2 = block.create_variable_for_type_inference(dtype=bool) + block.append_op( + type='cast', + inputs={'X': cast_out}, + outputs={'Out': cast_out2}, + attrs={ + 'in_dtype': cast_out.dtype, + 'out_dtype': int(core.VarDesc.VarType.FP32) + }) + sum_out = block.create_variable_for_type_inference( + dtype=block.input_dtype()) + block.append_op( + type='reduce_sum', + inputs={'X': cast_out2}, + outputs={'Out': sum_out}, + attrs={ + 'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': True if axis is None else False + }) + return sum_out + + def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): + block = LayerHelper('norm', **locals()) + out = block.create_variable_for_type_inference( + dtype=block.input_dtype()) + abs_out = block.create_variable_for_type_inference( + dtype=block.input_dtype()) + block.append_op( + type='abs', inputs={'X': input}, outputs={'Out': abs_out}) + pow_out = block.create_variable_for_type_inference( + dtype=block.input_dtype()) + + block.append_op( + 
type='pow', + inputs={'X': abs_out}, + outputs={'Out': pow_out}, + attrs={'factor': porder}) + sum_out = block.create_variable_for_type_inference( + dtype=block.input_dtype()) + block.append_op( + type='reduce_sum', + inputs={'X': pow_out}, + outputs={'Out': sum_out}, + attrs={ + 'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': True if axis is None else False + }) + porder + block.append_op( + type='pow', + inputs={'X': sum_out}, + outputs={'Out': out}, + attrs={'factor': float(1. / porder)}) + return out + if axis is None and p is not None: if isinstance(p, str): if p == "fro": - return frobenius_norm( - input, dim=axis, keepdim=keepdim, out=out, name=name) + return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name) else: raise ValueError( "only valid string values are 'fro', found {}".format(p)) elif isinstance(p, (int, float)): return vector_norm( - input, porder=p, axis=axis, keepdim=keepdim, out=out, name=name) + x, + porder=p, + axis=axis, + keepdim=keepdim, + asvector=True, + name=name) else: raise ValueError("only valid p type is string or float, found {}". format(type(p))) + if isinstance(axis, tuple): + axis = list(axis) if isinstance(axis, list) and len(axis) == 1: axis = axis[0] @@ -330,7 +450,12 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): if isinstance(axis, int): if isinstance(p, (int, float)): return vector_norm( - input, axis=axis, porder=p, keepdim=keepdim, out=out, name=name) + x, + axis=axis, + porder=p, + keepdim=keepdim, + asvector=False, + name=name) else: raise ValueError( "unspport p for p-order vector norm. except float, found {}". @@ -338,11 +463,14 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None): #calculate matrix norm, where axis is list with two integers elif isinstance(axis, list) and len(axis) == 2: if p == "fro": - return frobenius_norm( - input, dim=axis, keepdim=keepdim, out=out, name=name) + return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name) + elif p == 0: + return p0_matrix_norm(x, axis=axis, keepdim=keepdim, name=name) + elif p == np.inf or p == -np.inf: + return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) else: - raise ValueError( - "unspport p for matrix norm, expcept 'fro', found {}".format(p)) + return p_matrix_norm( + x, porder=p, axis=axis, keepdim=keepdim, name=name) else: raise ValueError( "except axis type int or list (length of list <=2), found {}". @@ -455,11 +583,12 @@ def dot(x, y, name=None): This operator calculates inner product for vectors. .. note:: - Only support 1-d Tensor(vector). + Support 1-d and 2-d Tensor. When it is 2d, the first dimension of this matrix + is the batch dimension, which means that the vectors of multiple batches are dotted. Parameters: - x(Tensor): 1-D ``Tensor``. Its datatype should be ``float32``, ``float64``, ``int32``, ``int64`` - y(Tensor): 1-D ``Tensor``. Its datatype soulde be ``float32``, ``float64``, ``int32``, ``int64`` + x(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64`` + y(Tensor): 1-D or 2-D ``Tensor``. Its dtype soulde be ``float32``, ``float64``, ``int32``, ``int64`` name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name` Returns: @@ -470,14 +599,13 @@ def dot(x, y, name=None): .. 
code-block:: python import paddle - import paddle.fluid as fluid import numpy as np paddle.disable_static() x_data = np.random.uniform(0.1, 1, [10]).astype(np.float32) y_data = np.random.uniform(1, 3, [10]).astype(np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) z = paddle.dot(x, y) print(z.numpy()) diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 18dbeb0c46e8a3416d7d57f92ffa6064510250b3..36b558d597c1ce1333a8f1eec54e2fd2813625e3 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -13,9 +13,11 @@ # limitations under the License. from ..fluid.layer_helper import LayerHelper -from ..fluid.data_feeder import check_type +from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.layers.layer_function_generator import templatedoc from .. import fluid +from ..fluid.framework import in_dygraph_mode +from paddle.common_ops_import import * # TODO: define logic functions of a tensor from ..fluid.layers import is_empty #DEFINE_ALIAS @@ -91,75 +93,70 @@ def equal_all(x, y, name=None): @templatedoc() -def allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): +def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): """ - :alias_main: paddle.allclose - :alias: paddle.allclose,paddle.tensor.allclose,paddle.tensor.logic.allclose - ${comment} Args: - input(inputtype):{input_comment}. - other(othertype):{other_comment}. - rtol(rtoltype,optional):{rtol_comment}. - atol(atoltype,optional):{atol_comment}. - equal_nan(equalnantype,optional):{equal_nan_comment}. - name(STR, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. + x(Tensor): ${input_comment}. + y(Tensor): ${other_comment}. + rtol(rtoltype, optional): ${rtol_comment}. + atol(atoltype, optional): ${atol_comment}. + equal_nan(equalnantype, optional): ${equal_nan_comment}. + name (str, optional): Name for the operation. For more information, please + refer to :ref:`api_guide_Name`. Default: None. Returns: - ${out_comment}. + Tensor: ${out_comment}. + + Raises: + TypeError: The data type of ``x`` must be one of float32, float64. + TypeError: The data type of ``y`` must be one of float32, float64. + TypeError: The type of ``rtol`` must be float. + TypeError: The type of ``atol`` must be float. + TypeError: The type of ``equal_nan`` must be bool. - Return Type: - ${out_type} - Examples: .. 
code-block:: python import paddle - import paddle.fluid as fluid import numpy as np - use_cuda = fluid.core.is_compiled_with_cuda() - - a = fluid.data(name="a", shape=[2], dtype='float32') - b = fluid.data(name="b", shape=[2], dtype='float32') + paddle.disable_static() - result = paddle.allclose(a, b, rtol=1e-05, atol=1e-08, + np_x = np.array([10000., 1e-07]).astype("float32") + np_y = np.array([10000.1, 1e-08]).astype("float32") + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") - result_nan = paddle.allclose(a, b, rtol=1e-05, atol=1e-08, + np_result1 = result1.numpy() + # [False] + result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan") - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - x = np.array([10000., 1e-07]).astype("float32") - y = np.array([10000.1, 1e-08]).astype("float32") - result_v, result_nan_v = exe.run( - feed={'a': x, 'b': y}, - fetch_list=[result, result_nan]) - print(result_v, result_nan_v) - # Output: (array([False]), array([False])) - - x = np.array([10000., 1e-08]).astype("float32") - y = np.array([10000.1, 1e-09]).astype("float32") - result_v, result_nan_v = exe.run( - feed={'a': x, 'b': y}, - fetch_list=[result, result_nan]) - print(result_v, result_nan_v) - # Output: (array([ True]), array([ True])) - - x = np.array([1.0, float('nan')]).astype("float32") - y = np.array([1.0, float('nan')]).astype("float32") - result_v, result_nan_v = exe.run( - feed={'a': x, 'b': y}, - fetch_list=[result, result_nan]) - print(result_v, result_nan_v) - # Output: (array([False]), array([ True])) + np_result2 = result2.numpy() + # [False] + + np_x = np.array([1.0, float('nan')]).astype("float32") + np_y = np.array([1.0, float('nan')]).astype("float32") + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, + equal_nan=False, name="ignore_nan") + np_result1 = result1.numpy() + # [False] + result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, + equal_nan=True, name="equal_nan") + np_result2 = result2.numpy() + # [True] """ + if in_dygraph_mode(): + return core.ops.allclose(x, y, 'rtol', rtol, 'atol', atol, 'equal_nan', + equal_nan) + + check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose') + check_variable_and_dtype(y, "input", ['float32', 'float64'], 'allclose') check_type(rtol, 'rtol', float, 'allclose') check_type(atol, 'atol', float, 'allclose') check_type(equal_nan, 'equal_nan', bool, 'allclose') @@ -167,7 +164,7 @@ def allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): helper = LayerHelper("allclose", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') - inputs = {'Input': input, 'Other': other} + inputs = {'Input': x, 'Other': y} outputs = {'Out': out} attrs = {'rtol': rtol, 'atol': atol, 'equal_nan': equal_nan} helper.append_op( diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 2c8157645de29e8d0e5e48f810907b01dd305c4a..845d2cf4d199328bbf8d0e03cd3a7a24a61aafd2 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -14,7 +14,7 @@ from __future__ import print_function -from ..fluid.layers import core, reshape +from ..fluid.layers import core from ..fluid.layer_helper import LayerHelper from ..fluid.framework import Variable, OpProtoHolder, 
in_dygraph_mode, convert_np_dtype_to_dtype_ from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype @@ -23,15 +23,11 @@ from ..fluid.layers import utils import numpy as np # TODO: define functions to manipulate a tensor from ..fluid.layers import cast #DEFINE_ALIAS -from ..fluid.layers import reshape #DEFINE_ALIAS -from ..fluid.layers import scatter #DEFINE_ALIAS from ..fluid.layers import slice #DEFINE_ALIAS from ..fluid.layers import strided_slice #DEFINE_ALIAS from ..fluid.layers import transpose #DEFINE_ALIAS -from ..fluid.layers import unique #DEFINE_ALIAS from ..fluid.layers import unstack #DEFINE_ALIAS -from ..fluid.layers import gather_nd #DEFINE_ALIAS from ..fluid.layers import scatter_nd_add #DEFINE_ALIAS from ..fluid.layers import scatter_nd #DEFINE_ALIAS from ..fluid.layers import shard_index #DEFINE_ALIAS @@ -111,9 +107,9 @@ def concat(x, axis=0, name=None): [14, 15, 16]]) in3 = np.array([[21, 22], [23, 24]]) - x1 = paddle.to_variable(in1) - x2 = paddle.to_variable(in2) - x3 = paddle.to_variable(in3) + x1 = paddle.to_tensor(in1) + x2 = paddle.to_tensor(in2) + x3 = paddle.to_tensor(in3) zero = paddle.full(shape=[1], dtype='int32', fill_value=0) # When the axis is negative, the real axis is (axis + Rank(x)) # As follow, axis is -1, Rank(x) is 2, the real axis is 1 @@ -377,7 +373,7 @@ def roll(x, shifts, axis=None, name=None): outputs={'Out': out}, attrs={'axis': axis, 'shifts': shifts}) - out = reshape(out, shape=origin_shape, inplace=True) + out = layers.reshape(out, shape=origin_shape, inplace=True) return out @@ -475,9 +471,6 @@ def stack(x, axis=0, name=None): def split(x, num_or_sections, axis=0, name=None): """ - :alias_main: paddle.split - :alias: paddle.tensor.split, paddle.tensor.manipulation.split - Split the input tensor into multiple sub-Tensors. Args: @@ -507,7 +500,7 @@ def split(x, num_or_sections, axis=0, name=None): paddle.disable_static() # x is a Tensor which shape is [3, 9, 5] x_np = np.random.random([3, 9, 5]).astype("int32") - x = paddle.to_variable(x_np) + x = paddle.to_tensor(x_np) out0, out1, out22 = paddle.split(x, num_or_sections=3, axis=1) # out0.shape [3, 3, 5] @@ -613,6 +606,130 @@ def squeeze(x, axis=None, name=None): return layers.squeeze(x, axis, name) +def unique(x, + return_index=False, + return_inverse=False, + return_counts=False, + axis=None, + dtype="int64", + name=None): + """ + Returns the unique elements of `x` in ascending order. + + Args: + x(Tensor): The input tensor, it's data type should be float32, float64, int32, int64. + return_index(bool, optional): If True, also return the indices of the input tensor that + result in the unique Tensor. + return_inverse(bool, optional): If True, also return the indices for where elements in + the original input ended up in the returned unique tensor. + return_counts(bool, optional): If True, also return the counts for each unique element. + axis(int, optional): The axis to apply unique. If None, the input will be flattened. + Default: None. + dtype(np.dtype|str, optional): The date type of `indices` or `inverse` tensor: int32 or int64. + Default: int64. + name(str, optional): Name for the operation. For more information, please refer to + :ref:`api_guide_Name`. Default: None. + + Returns: + tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ + provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \ + is True. `counts` is provided only if `return_counts` is True. + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + x_data = np.array([2, 3, 3, 1, 5, 3]) + x = paddle.to_tensor(x_data) + unique = paddle.unique(x) + np_unique = unique.numpy() # [1 2 3 5] + _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) + np_indices = indices.numpy() # [3 0 1 4] + np_inverse = inverse.numpy() # [1 2 2 0 3 2] + np_counts = counts.numpy() # [1 1 3 1] + + x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) + x = paddle.to_tensor(x_data) + unique = paddle.unique(x) + np_unique = unique.numpy() # [0 1 2 3] + + unique = paddle.unique(x, axis=0) + np_unique = unique.numpy() + # [[2 1 3] + # [3 0 1]] + """ + if axis is None: + axis = [] + else: + axis = [axis] + attr_dtype = convert_np_dtype_to_dtype_(dtype) + if in_dygraph_mode(): + out, inverse, indices, counts = core.ops.unique( + x, 'dtype', attr_dtype, 'return_index', return_index, + 'return_inverse', return_inverse, 'return_counts', return_counts, + 'axis', axis, "is_sorted", True) + outs = [out] + if return_index: + outs.append(indices) + if return_inverse: + outs.append(inverse) + if return_counts: + outs.append(counts) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + check_variable_and_dtype(x, "input", + ['float32', 'float64', 'int32', 'int64'], 'unique') + check_type(return_index, 'return_index', bool, 'unique') + check_type(return_inverse, 'return_inverse', bool, 'unique') + check_type(return_counts, 'return_counts', bool, 'unique') + check_dtype(dtype, 'dtype', ['int32', 'int64'], 'unique') + if len(axis) != 0: + check_type(axis[0], 'axis', int, 'unique') + + helper = LayerHelper('unique', **locals()) + attrs = { + 'dtype': attr_dtype, + "return_index": return_index, + "return_inverse": return_inverse, + "return_counts": return_counts, + "axis": axis, + "is_sorted": True + } + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + inverse = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True) + outputs = {"Out": out, "Index": inverse} + outs = [out] + if return_index: + indices = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True) + outputs["Indices"] = indices + outs.append(indices) + if return_inverse: + outs.append(inverse) + if return_counts: + counts = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True) + outputs["Counts"] = counts + outs.append(counts) + + helper.append_op( + type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def unsqueeze(x, axis, name=None): """ :alias_main: paddle.unsqueeze @@ -659,50 +776,46 @@ def unsqueeze(x, axis, name=None): return layers.unsqueeze(x, axis, name) -def gather(input, index, overwrite=True): +def gather(x, index, axis=None, name=None): """ - :alias_main: paddle.gather - :alias: paddle.gather,paddle.tensor.gather,paddle.tensor.manipulation.gather **Gather Layer** - Output is obtained by gathering entries of the outer-most dimension - of X indexed by `index` and concatenate them together. - - .. math:: - - Out = X[Index] - + Output is obtained by gathering entries of ``axis`` + of ``x`` indexed by ``index`` and concatenate them together. .. 
code-block:: text Given: - X = [[1, 2], + x = [[1, 2], [3, 4], [5, 6]] - Index = [1, 2] + index = [1, 2] + axis=[0] Then: - Out = [[3, 4], + out = [[3, 4], [5, 6]] Args: - input (Variable): The source input tensor with rank>=1. Supported data type is + x (Tensor): The source input tensor with rank>=1. Supported data type is int32, int64, float32, float64 and uint8 (only for CPU), float16 (only for GPU). - index (Variable): The index input tensor with rank=1. Data type is int32 or int64. - overwrite (bool, optional): The mode that updating the grad when has same index. - If True, use the overwrite mode to update the grad of the same index, - if False, use the accumulate mode to update the grad of the same index. - Default value is True. - - + index (Tensor): The index input tensor with rank=1. Data type is int32 or int64. + axis (Tensor|int, optional): The axis of input to be gathered, it's can be int or a Tensor with data type is int32 or int64. The default value is None, if None, the ``axis`` is 0. + name (str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . Returns: - output (Variable): The output is a tensor with the same rank as input. + output (Tensor): The output is a tensor with the same rank as ``x``. + + Raises: + TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. + TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. + TypeError: ``axis`` must be a Tensor or int and the data type of ``index`` must be int32 or int64 when it's a Tensor. Examples: @@ -710,26 +823,41 @@ def gather(input, index, overwrite=True): import numpy as np import paddle - import paddle.fluid as fluid - - with fluid.dygraph.guard(): - input_1 = np.array([[1,2],[3,4],[5,6]]) - index_1 = np.array([0,1]) - input = fluid.dygraph.to_variable(input_1) - index = fluid.dygraph.to_variable(index_1) - output = paddle.gather(input, index) - # expected output: [[1,2],[3,4]] + paddle.disable_static() + input_1 = np.array([[1,2],[3,4],[5,6]]) + index_1 = np.array([0,1]) + input = paddle.to_tensor(input_1) + index = paddle.to_tensor(index_1) + output = paddle.gather(input, index, axis=0) + # expected output: [[1,2],[3,4]] """ + if axis is None: + axis = 0 + axis_tensor = axis + if not isinstance(axis, Variable): + axis_tensor = fill_constant(shape=[1], dtype='int64', value=axis) + if in_dygraph_mode(): + return core.ops.gather(x, index, axis_tensor) + + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + 'gather') + check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') + if isinstance(axis, Variable): + check_variable_and_dtype(axis, 'axis', ['int32', 'int64'], 'gather') + else: + check_type(axis, 'axis', (int), 'gather') + helper = LayerHelper('gather', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="gather", - inputs={"X": input, - "Index": index}, - outputs={"Out": out}, - attrs={'overwrite': overwrite}) + inputs={"X": x, + "Index": index, + "Axis": axis_tensor}, + outputs={"Out": out}) return out @@ -790,6 +918,100 @@ def unbind(input, axis=0): return outs +def scatter(x, index, updates, overwrite=True, name=None): + """ + **Scatter Layer** + Output is obtained by updating the input on selected indices based on updates. + + .. 
code-block:: python + import numpy as np + #input: + x = np.array([[1, 1], [2, 2], [3, 3]]) + index = np.array([2, 1, 0, 1]) + # shape of updates should be the same as x + # shape of updates with dim > 1 should be the same as input + updates = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) + overwrite = False + # calculation: + if not overwrite: + for i in range(len(index)): + x[index[i]] = np.zeros((2)) + for i in range(len(index)): + if (overwrite): + x[index[i]] = updates[i] + else: + x[index[i]] += updates[i] + # output: + out = np.array([[3, 3], [6, 6], [1, 1]]) + out.shape # [3, 2] + + **NOTICE**: The order in which updates are applied is nondeterministic, + so the output will be nondeterministic if index contains duplicates. + + Args: + x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64. + index (Tensor): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length. + updates (Tensor): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input. + overwrite (bool): The mode that updating the output when there are same indices. + If True, use the overwrite mode to update the output of the same index, + if False, use the accumulate mode to update the output of the same index.Default value is True. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor: The output is a Tensor with the same shape as x. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + + x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32) + index_data = np.array([2, 1, 0, 1]).astype(np.int64) + updates_data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float32) + + x = paddle.to_tensor(x_data) + index = paddle.to_tensor(index_data) + updates = paddle.to_tensor(updates_data) + + output1 = paddle.scatter(x, index, updates, overwrite=False) + # [[3., 3.], + # [6., 6.], + # [1., 1.]] + + output2 = paddle.scatter(x, index, updates, overwrite=True) + # CPU device: + # [[3., 3.], + # [4., 4.], + # [1., 1.]] + # GPU device maybe have two results because of the repeated numbers in index + # result 1: + # [[3., 3.], + # [4., 4.], + # [1., 1.]] + # result 2: + # [[3., 3.], + # [2., 2.], + # [1., 1.]] + """ + if in_dygraph_mode(): + return core.ops.scatter(x, index, updates, 'overwrite', overwrite) + + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'scatter') + check_type(overwrite, 'overwrite', bool, 'scatter') + helper = LayerHelper('scatter', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type="scatter", + inputs={"X": x, + "Ids": index, + "Updates": updates}, + attrs={'overwrite': overwrite}, + outputs={"Out": out}) + return out + + def chunk(x, chunks, axis=0, name=None): """ Split the input tensor into multiple sub-Tensors. 
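The overwrite/accumulate semantics of the new ``paddle.scatter`` above follow the pseudo-calculation given in its docstring: with ``overwrite=False``, the rows named in ``index`` are first zeroed and every update is then summed into its target row. The sketch below (illustrative only, not part of the patch) reproduces that reference computation with ``np.add.at`` and compares it with the op's output.

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()

    x_np = np.array([[1, 1], [2, 2], [3, 3]], dtype='float32')
    index_np = np.array([2, 1, 0, 1], dtype='int64')
    updates_np = np.array([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')

    out = paddle.scatter(paddle.to_tensor(x_np),
                         paddle.to_tensor(index_np),
                         paddle.to_tensor(updates_np),
                         overwrite=False)

    # Reference computation from the docstring: clear the indexed rows,
    # then accumulate every update into its target row.
    ref = x_np.copy()
    ref[index_np] = 0
    np.add.at(ref, index_np, updates_np)

    print(out.numpy())   # [[3. 3.] [6. 6.] [1. 1.]]
    print(ref)           # same result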
@@ -817,7 +1039,7 @@ def chunk(x, chunks, axis=0, name=None): paddle.disable_static() # x is a Tensor which shape is [3, 9, 5] x_np = np.random.random([3, 9, 5]).astype("int32") - x = paddle.to_variable(x_np) + x = paddle.to_tensor(x_np) out0, out1, out22 = paddle.chunk(x, chunks=3, axis=1) # out0.shape [3, 3, 5] @@ -841,7 +1063,7 @@ def tile(x, repeat_times, name=None): """ Construct a new Tensor by repeating ``x`` the number of times given by ``repeat_times``. - After tiling, the number of elements of the i'th dimension of the output is equal to ``x.dims[i] * repeat_times[i]``. + After tiling, the value of the i'th dimension of the output is equal to ``x.shape[i]*repeat_times[i]``. Both the number of dimensions of ``x`` and the number of elements in ``repeat_times`` should be less than or equal to 6. @@ -862,21 +1084,24 @@ def tile(x, repeat_times, name=None): paddle.disable_static() np_data = np.array([1, 2, 3]).astype('int32') - data = paddle.to_variable(np_data) + data = paddle.to_tensor(np_data) out = paddle.tile(data, repeat_times=[2, 1]) - np_out = out1.numpy() + np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] out = paddle.tile(data, repeat_times=[2, 2]) - np_out = out.numpy() + np_out = out.numpy() # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]] np_repeat_times = np.array([2, 1]).astype("int32") - repeat_times = paddle.to_variable(np_repeat_times) + repeat_times = paddle.to_tensor(np_repeat_times) out = paddle.tile(data, repeat_times=repeat_times) - np_out = out.numpy() + np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] """ + if in_dygraph_mode(): + return core.ops.tile(x, 'repeat_times', repeat_times) + check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile') check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') @@ -884,9 +1109,9 @@ def tile(x, repeat_times, name=None): raise ValueError( "When the date type is bool for the input 'x' of tile op, you " "must set its stop_gradient to be True by " - "some_var.stop_gradient == True supporting some_var as the input.") + "some_var.stop_gradient == True supporting some_var is the input.") - helper = LayerHelper('tile', input=x, **locals()) + helper = LayerHelper('tile', **locals()) inputs = {"X": [x]} attrs = {} @@ -928,7 +1153,7 @@ def expand_as(x, y, name=None): Args: x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64. - y (Tensor): The input tensor gives the shape that ``x`` to expand to. + y (Tensor): The input tensor that gives the shape to expand to. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
Returns: @@ -942,14 +1167,17 @@ def expand_as(x, y, name=None): paddle.disable_static() - np_data_x = np.array([1, 2, 3]).astype=('int32) - np_data_y = np.array([[1, 2, 3], [4, 5, 6]]).astype=('int32) - data_x = paddle.to_variable(np_data_x) - data_y = paddle.to_variable(np_data_y) + np_data_x = np.array([1, 2, 3]).astype('int32') + np_data_y = np.array([[1, 2, 3], [4, 5, 6]]).astype('int32') + data_x = paddle.to_tensor(np_data_x) + data_y = paddle.to_tensor(np_data_y) out = paddle.expand_as(data_x, data_y) - np_out = out.numpy() + np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] """ + if in_dygraph_mode(): + return core.ops.expand_as_v2(x, y) + check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand_as') check_type(y, 'y', Variable, 'expand_as') @@ -962,7 +1190,7 @@ def expand_as(x, y, name=None): "some_var as the input 'x'.") inputs = {"X": [x], "target_tensor": [y]} - helper = LayerHelper('expand_as', input=x, **locals()) + helper = LayerHelper('expand_as', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) helper.append_op(type='expand_as_v2', inputs=inputs, outputs={'Out': out}) @@ -994,18 +1222,15 @@ def expand(x, shape, name=None): import paddle paddle.disable_static() - np_data = np.array([1, 2, 3]).astype=('int32) - data = paddle.to_variable(np_data) + np_data = np.array([1, 2, 3]).astype('int32') + data = paddle.to_tensor(np_data) out = paddle.expand(data, shape=[2, 3]) - out = out.numpy() - # [[1, 2, 3], [1, 2, 3]] - - np_shape = np.array([2, 3]).astype=('int32) - shape = paddle.to_variable(np_shape) - out = paddle.expand(data, shape=shape) - out = out.numpy + out = out.numpy() # [[1, 2, 3], [1, 2, 3]] """ + if in_dygraph_mode(): + return core.ops.expand_v2(x, 'shape', shape) + check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand') check_type(shape, 'shape', (list, tuple, Variable), 'expand') @@ -1018,7 +1243,7 @@ def expand(x, shape, name=None): "some_var.stop_gradient = True, supporting " "some_var as the input.") - helper = LayerHelper('expand', input=x, **locals()) + helper = LayerHelper('expand', **locals()) def get_attr_expand_shape(list_expand_shape): attrs_expand_shape = [] @@ -1048,3 +1273,166 @@ def expand(x, shape, name=None): broadcast_to = expand + + +def reshape(x, shape, name=None): + """ + :alias_main: paddle.reshape + :alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape + + This operator changes the shape of ``x`` without changing its data. + + Some tricks exist when specifying the target shape. + + 1. -1 means the value of this dimension is inferred from the total element + number of x and remaining dimensions. Thus one and only one dimension can + be set -1. + + 2. 0 means the actual dimension value is going to be copied from the + corresponding dimension of x. The index of 0s in shape can not exceed + the dimension of x. + + Here are some examples to explain it. + + 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [6, 8], the reshape operator will transform x into a 2-D tensor with + shape [6, 8] and leaving x's data unchanged. + + 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + specified is [2, 3, -1, 2], the reshape operator will transform x into a + 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. 
In this + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining + dimensions. + + 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor + with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, + besides -1, 0 means the actual dimension value is going to be copied from + the corresponding dimension of x. + + Args: + x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``. + shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1. + The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. + If ``shape`` is an Tensor, it should be an 1-D Tensor . + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor: A reshaped Tensor with the same data type as ``x``. + + Raises: + ValueError: If more than one elements of ``shape`` is -1. + ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. + ValueError: If the elements in ``shape`` is negative except -1. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + + data = np.random.random([2, 4, 6]).astype("float32") + x = paddle.to_tensor(data) + + positive_four = paddle.fill_constant([1], "int32", 4) + + out_1 = paddle.reshape(x, [-1, 0, 3, 2]) + # the shape of out_1 is [2,4,3,2]. + + out_2 = paddle.reshape(x, shape=[positive_four, 12]) + # the shape of out_2 is [4, 12]. + + shape_tensor = paddle.to_tensor(np.array([8, 6]).astype("int32")) + out_3 = paddle.reshape(x, shape=shape_tensor) + # the shape of out_2 is [8, 6]. + """ + return paddle.fluid.layers.reshape(x=x, shape=shape, name=name) + + +def gather_nd(x, index, name=None): + """ + + This function is actually a high-dimensional extension of :code:`gather` + and supports for simultaneous indexing by multiple axes. :attr:`index` is a + K-dimensional integer tensor, which is regarded as a (K-1)-dimensional + tensor of :attr:`index` into :attr:`input`, where each element defines + a slice of params: + + .. math:: + + output[(i_0, ..., i_{K-2})] = input[index[(i_0, ..., i_{K-2})]] + + Obviously, :code:`index.shape[-1] <= input.rank` . And, the output tensor has + shape :code:`index.shape[:-1] + input.shape[index.shape[-1]:]` . + + .. code-block:: text + + Given: + x = [[[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]], + [[12, 13, 14, 15], + [16, 17, 18, 19], + [20, 21, 22, 23]]] + x.shape = (2, 3, 4) + + * Case 1: + index = [[1]] + + gather_nd(x, index) + = [x[1, :, :]] + = [[12, 13, 14, 15], + [16, 17, 18, 19], + [20, 21, 22, 23]] + + * Case 2: + index = [[0,2]] + + gather_nd(x, index) + = [x[0, 2, :]] + = [8, 9, 10, 11] + + * Case 3: + index = [[1, 2, 3]] + + gather_nd(x, index) + = [x[1, 2, 3]] + = [23] + + Args: + x (Tensor): The input Tensor which it's data type should be bool, float32, float64, int32, int64. + index (Tensor): The index input with rank > 1, index.shape[-1] <= input.rank. + Its dtype should be int32, int64. + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . 
+ + Returns: + output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] + + Raises: + TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. + TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np_x = np.array([[[1, 2], [3, 4], [5, 6]], + [[7, 8], [9, 10], [11, 12]]]) + np_index = [[0, 1]] + x = paddle.to_tensor(np_x) + index = paddle.to_tensor(np_index) + + output = paddle.gather_nd(x, index) #[[3, 4]] + + """ + + return paddle.fluid.layers.gather_nd(input=x, index=index, name=name) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py old mode 100644 new mode 100755 index 14084b8295bbc6ba40c31d6cf838855ce0e4b77f..0d87c1c2cf705372de7b8534cf8faea1bb5320a6 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -15,14 +15,16 @@ math functions """ from __future__ import print_function +import numpy as np from paddle.common_ops_import import * +from paddle.tensor import cast +import paddle from ..fluid import layers from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn -import sys # TODO: define math functions # yapf: disable @@ -51,18 +53,19 @@ from ..fluid.layers import reduce_sum #DEFINE_ALIAS from ..fluid.layers import round #DEFINE_ALIAS from ..fluid.layers import rsqrt #DEFINE_ALIAS from ..fluid.layers import scale #DEFINE_ALIAS -from ..fluid.layers import sign #DEFINE_ALIAS from ..fluid.layers import square #DEFINE_ALIAS from ..fluid.layers import stanh #DEFINE_ALIAS from ..fluid.layers import atan #DEFINE_ALIAS from ..fluid.layers import erf #DEFINE_ALIAS from ..fluid.layers import sqrt #DEFINE_ALIAS from ..fluid.layers import sin #DEFINE_ALIAS -from ..fluid.layers import tanh #DEFINE_ALIAS from ..fluid.layers import increment #DEFINE_ALIAS from ..fluid.layers import multiplex #DEFINE_ALIAS from ..fluid.layers import sums #DEFINE_ALIAS +from ..fluid import layers +import paddle + __all__ = [ 'abs', @@ -83,9 +86,11 @@ __all__ = [ 'floor', 'increment', 'log', + 'logsumexp', 'mul', 'multiplex', 'pow', + 'prod', 'reciprocal', 'reduce_max', 'reduce_min', @@ -109,7 +114,11 @@ __all__ = [ 'min', 'minimum', 'mm', - 'div', + 'divide', + 'floor_divide', + 'remainder', + 'mod', + 'floor_mod', 'multiply', 'add', 'atan', @@ -119,70 +128,109 @@ __all__ = [ 'erf', 'addcmul', 'addmm', - 'clamp', + 'clip', 'trace', - 'kron' + 'kron', + 'isfinite', + 'isinf', + 'isnan' ] # yapf: enable. -@templatedoc() -def pow(input, exponent, name=None): +_supported_int_dtype_ = [ + VarDesc.VarType.UINT8, + VarDesc.VarType.INT8, + VarDesc.VarType.INT16, + VarDesc.VarType.INT32, + VarDesc.VarType.INT64, +] + +_supported_float_dtype_ = [ + VarDesc.VarType.FP32, + VarDesc.VarType.FP64, +] + +def pow(x, y, name=None): """ - :alias_main: paddle.pow - :alias: paddle.pow,paddle.tensor.pow,paddle.tensor.math.pow + Compute the power of tensor elements. The equation is: - This is Pow Activation Operator. + .. math:: + out = x^{y} - :math:`out = input^{exponent}` + **Note**: + ``paddle.pow`` supports broadcasting. 
If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . - Args: - input(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32`` or ``float64``. - exponent(float32|Variable): A scalar with type ``float32`` or a ``Tensor`` with shape [1] and type ``float32``. - name(str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . + Args: + x (Tensor): An N-D Tensor, the data type is float32, float64, int32 or int64. + y (Tensor): An N-D Tensor with type float32, float64, int32 or int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Returns: - Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``input``. + N-D Tensor. A location into which the result is stored. Its dimension equals with $x$. Examples: - .. code-block:: python + .. code-block:: python import paddle - import paddle.fluid as fluid - - x = fluid.data(name="x", shape=[32,32], dtype="float32") + import numpy as np - # example 1: argument exponent is float - y_1 = paddle.pow(x, 2.0) - # y_1 is x^{2.0} + paddle.disable_static() + + # example 1: y is a float + x_data = np.array([1, 2, 3]) + y = 2 + x = paddle.to_tensor(x_data) + res = paddle.pow(x, y) + print(res.numpy()) # [1 4 9] + + # example 2: y is a Tensor + y = paddle.fill_constant(shape=[1], value=2, dtype='float32') + res = paddle.pow(x, y) + print(res.numpy()) # [1 4 9] - # example 2: argument exponent is Variable - exponent_tensor = fluid.layers.fill_constant([1], "float32", 3.0) - y_2 = paddle.pow(x, exponent_tensor) - # y_2 is x^{3.0} """ + # in dynamic graph mode if in_dygraph_mode(): - return core.ops.pow(input, "exponent", exponent) - - helper = LayerHelper('pow', **locals()) - inputs = {'X': input} - attrs = {} - if isinstance(exponent, Variable): - exponent.stop_gradient = True - inputs['FactorTensor'] = exponent + if isinstance(y, (int, float)): + return core.ops.pow(x, 'factor', y) + elif isinstance(y, (paddle.Tensor, Variable)): + + if x.dtype != y.dtype: + y = cast(y, dtype='float64') + x = cast(x, dtype='float64') + out_dygraph = _elementwise_op_in_dygraph( + x, y, axis=-1, act=None, op_name='elementwise_pow') + return out_dygraph + + return _elementwise_op_in_dygraph( + x, y, axis=-1, act=None, op_name='elementwise_pow') + else: + raise TypeError('y must be scalar or tensor type, but received: %s '% (y.dtype)) + # in static graph mode else: - attrs['factor'] = exponent - - out = helper.create_variable_for_type_inference(dtype=input.dtype) - check_dtype( - out.dtype, out.name, - convert_dtype(input.dtype), 'pow', - '(The out data type in pow must be the same with input data type.)') + if isinstance(y, (int, float)): + helper = LayerHelper('pow', **locals()) + inputs = {'X': x} + attrs = {'factor': y} + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs) + return out + elif isinstance(y, (paddle.Tensor, Variable)): + # TODO A potential speed improvement is supporting different types in C++ and removing the cast ops here + helper = LayerHelper('elementwise_pow', **locals()) + if x.dtype != y.dtype: + y = cast(y, dtype='float64') + x = cast(x, dtype='float64') + out = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + return 
_elementwise_op(LayerHelper('elementwise_pow', **locals())) + else: + raise TypeError('y must be scalar or tensor type, but received: %s '% (type(y))) - helper.append_op( - type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs) - return out @dygraph_only @@ -205,6 +253,8 @@ def _elementwise_op(helper): x = helper.kwargs.get('x', None) y = helper.kwargs.get('y', None) + out = helper.kwargs.get('out', None) + assert x is not None, 'x cannot be None in {}'.format(original_op_type) assert y is not None, 'y cannot be None in {}'.format(original_op_type) check_variable_and_dtype( @@ -217,11 +267,12 @@ def _elementwise_op(helper): axis = helper.kwargs.get('axis', -1) use_mkldnn = helper.kwargs.get('use_mkldnn', False) name = helper.kwargs.get('name', None) - if name is None: - out = helper.create_variable_for_type_inference(dtype=x.dtype) - else: - out = helper.create_variable( - name=name, dtype=x.dtype, persistable=False) + + if out is None: + if name is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + out = helper.create_variable(name=name, dtype=x.dtype, persistable=False) helper.append_op( type=op_type, @@ -261,144 +312,331 @@ Examples: return _elementwise_op(LayerHelper(op_type, **locals())) -def div(x, y, name=None): +def divide(x, y, name=None): """ -Examples: + Divide two tensors element-wise. The equation is: - .. code-block:: python + .. math:: + out = x / y - import paddle - import paddle.fluid as fluid - import numpy as np + **Note**: + ``paddle.divide`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = paddle.div(x, y) - # z = x / y + Returns: + N-D Tensor. A location into which the result is stored. It's dimension equals with $x$. - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + Examples: - print(z_value) # [2., 0.6, 2.] + .. code-block:: python + import paddle + import numpy as np - .. code-block:: python + paddle.disable_static() - import paddle - import paddle.fluid as fluid - import numpy as np + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.divide(x, y) + print(z.numpy()) # [2., 0.6, 2.] 
- def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((4, 5)).astype('float32') - } + """ + op_type = 'elementwise_div' + axis = -1 + act = None + if in_dygraph_mode(): + # rule 1 : avoid numpy.ndarray + if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): + raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") + + # rule 2: both the inputs are not Tensor + elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): + x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x) + y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y) + + # rule 3: both the inputs are Tensor + elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): + if y.dtype != x.dtype: + raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." + "But x is {}, y is {}".format(x.dtype, y.dtype)) + elif x.dtype in _supported_int_dtype_: + x = x.astype(paddle.get_default_dtype()) + y = y.astype(paddle.get_default_dtype()) + + # rule 4: x is Tensor, y is scalar + elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): + if x.dtype in _supported_int_dtype_: + x = x.astype(paddle.get_default_dtype()) + y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) + + # rule 5: x is scalar, y is Tensor + elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): + if y.dtype in _supported_int_dtype_: + y = y.astype(paddle.get_default_dtype()) + x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) - x = fluid.data(name="x", shape=[2, 3, 4, 5], dtype='float32') - y = fluid.data(name="y", shape=[4, 5], dtype='float32') - z = paddle.div(x, y, name='z') - # z = x / y + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name=op_type) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + # rule 1 : avoid numpy.ndarray + if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): + raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") + + # rule 2: both the inputs are not Tensor + elif not isinstance(x, Variable) and not isinstance(y, Variable): + x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) + y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) + + # rule 3: both the inputs are Tensor + elif isinstance(x, Variable) and isinstance(y, Variable): + if y.dtype != x.dtype: + raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." + "But x is {}, y is {}".format(x.dtype, y.dtype)) + elif x.dtype in _supported_int_dtype_: + x = paddle.cast(x, paddle.get_default_dtype()) + y = paddle.cast(y, paddle.get_default_dtype()) + + # rule 4: x is Tensor, y is scalar + elif isinstance(x, Variable) and not isinstance(y, Variable): + if x.dtype in _supported_int_dtype_: + x = paddle.cast(x, paddle.get_default_dtype()) + y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) + + # rule 5: x is scalar, y is Tensor + elif not isinstance(x, Variable) and isinstance(y, Variable): + if y.dtype in _supported_int_dtype_: + y = paddle.cast(y, paddle.get_default_dtype()) + x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + return _elementwise_op(LayerHelper(op_type, **locals())) - print(z_value[0]) - print(z_value[0].shape) # z.shape=[2,3,4,5] +def floor_divide(x, y, name=None): + """ + Floor divide two tensors element-wise. 
The equation is: - .. code-block:: python + .. math:: + out = x // y - import paddle - import paddle.fluid as fluid - import numpy as np + **Note**: + ``paddle.floor_divide`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } + Args: + x (Tensor): the input tensor, it's data type should be int32, int64. + y (Tensor): the input tensor, it's data type should be int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = paddle.div(x, y) - # z = x / y + Returns: + N-D Tensor. A location into which the result is stored. It's dimension equals with $x$. - place = fluid.CPUPlace() - exe = fluid.Executor(place) + Examples: - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value[0]) - print(z_value[0].shape) # z.shape=[2,3,4,5] + .. code-block:: python + import paddle + import numpy as np - .. code-block:: python + paddle.disable_static() - import paddle - import paddle.fluid as fluid - import numpy as np + np_x = np.array([2, 3, 8, 7]) + np_y = np.array([1, 5, 3, 3]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.floor_divide(x, y) + print(z.numpy()) # [2, 0, 2, 2] - with fluid.dygraph.guard(fluid.CPUPlace()): - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = fluid.dygraph.to_variable(np_x) - y = fluid.dygraph.to_variable(np_y) - z = paddle.div(x, y) - np_z = z.numpy() - print(np_z) # [2., 0.6, 2.] + """ + op_type = 'elementwise_floordiv' + axis = -1 + if in_dygraph_mode(): + # rule 1 : avoid numpy.ndarray + if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): + raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.") + + # rule 2: both the inputs are not Tensor + elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): + x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x) + y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y) + # rule 3: both the inputs are Tensor + elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): + if y.dtype != x.dtype: + raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype." 
+ "But x is {}, y is {}".format(x.dtype, y.dtype)) + + # rule 4: x is Tensor, y is scalar + elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): + y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) + + # rule 5: x is scalar, y is Tensor + elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): + x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) + + return _elementwise_op_in_dygraph( + x, y, axis=axis, op_name=op_type) + + # rule 1 : avoid numpy.ndarray + if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): + raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") + + # rule 2: both the inputs are not Tensor + elif not isinstance(x, Variable) and not isinstance(y, Variable): + x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) + y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) + + # rule 3: both the inputs are Tensor + elif isinstance(x, Variable) and isinstance(y, Variable): + if y.dtype != x.dtype: + raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." + "But x is {}, y is {}".format(x.dtype, y.dtype)) + + # rule 4: x is Tensor, y is scalar + elif isinstance(x, Variable) and not isinstance(y, Variable): + y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) + + # rule 5: x is scalar, y is Tensor + elif not isinstance(x, Variable) and isinstance(y, Variable): + x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) + + return _elementwise_op(LayerHelper(op_type, **locals())) + + +def remainder(x, y, name=None): """ - op_type = 'elementwise_div' + Mod two tensors element-wise. The equation is: + + .. math:: + out = x \% y + + **Note**: + ``paddle.remainder`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . + + Args: + x (Tensor): the input tensor, it's data type should be int32, int64. + y (Tensor): the input tensor, it's data type should be int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + N-D Tensor. A location into which the result is stored. It's dimension equals with $x$. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + np_x = np.array([2, 3, 8, 7]) + np_y = np.array([1, 5, 3, 3]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.remainder(x, y) + print(z.numpy()) # [0, 3, 2, 1] + + """ + op_type = 'elementwise_mod' axis = -1 - act = None if in_dygraph_mode(): + # rule 1 : avoid numpy.ndarray + if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): + raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") + + elif not isinstance(x, paddle.Tensor): + raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) + + # rule 3: both the inputs are Tensor + elif isinstance(y, paddle.Tensor): + if y.dtype != x.dtype: + raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." 
+ "But x is {}, y is {}".format(x.dtype, y.dtype)) + + # rule 4: x is Tensor, y is scalar + elif not isinstance(y, paddle.Tensor): + y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) + return _elementwise_op_in_dygraph( - x, y, axis=axis, act=act, op_name=op_type) + x, y, axis=axis, op_name=op_type) + + # rule 1 : avoid numpy.ndarray + if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): + raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") + + elif not isinstance(x, Variable): + raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) + + # rule 3: both the inputs are Tensor + elif isinstance(y, Variable): + if y.dtype != x.dtype: + raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." + "But x is {}, y is {}".format(x.dtype, y.dtype)) + + # rule 4: x is Tensor, y is scalar + elif not isinstance(y, paddle.Tensor): + y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) return _elementwise_op(LayerHelper(op_type, **locals())) +mod = remainder #DEFINE_ALIAS +floor_mod = remainder #DEFINE_ALIAS + + def multiply(x, y, axis=-1, name=None): """ - :alias_main: paddle.multiply - :alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply + multiply two tensors element-wise. The equation is: -Examples: + .. math:: + out = x * y - .. code-block:: python + **Note**: + ``paddle.multiply`` supports broadcasting. If you would like to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . - import paddle - import numpy as np + Args: + x (Tensor): the input tensor, its data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, its data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) - res = paddle.multiply(x, y) - print(res.numpy()) # [[5, 12], [21, 32]] + Returns: + N-D Tensor. A location into which the result is stored. Its dimension equals with $x$. - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) - res = paddle.multiply(x, y, axis=1) - print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) + y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.multiply(x, y) + print(res.numpy()) # [[5, 12], [21, 32]] + + x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) + y_data = np.array([1, 2], dtype=np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.multiply(x, y, axis=1) + print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] """ op_type = 'elementwise_mul' act = None + if x.dtype != y.dtype: + raise TypeError( + 'Input tensors must be same type, but received type of x: %s, type of y: %s ' + % (x.dtype, y.dtype)) + if in_dygraph_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) @@ -510,7 +748,6 @@ Examples: for func in [ add, - div, maximum, minimum, multiply @@ -532,75 +769,84 @@ for func in [ }) + """\n""" + str(func.__doc__) -def sum(input, dim=None, dtype=None, keep_dim=False, name=None): +def sum(x, axis=None, dtype=None, keepdim=False, name=None): """ - :alias_main: paddle.sum - :alias: paddle.sum,paddle.tensor.sum,paddle.tensor.math.sum - Computes the sum of tensor elements over the given dimension. Args: - input (Variable): The input variable which is a Tensor, the data type is float32, - float64, int32, int64. - dim (list|int, optional): The dimensions along which the sum is performed. If - :attr:`None`, sum all elements of :attr:`input` and return a + x (Tensor): An N-D Tensor, the data type is float32, float64, int32 or int64. + axis (int|list|tuple, optional): The dimensions along which the sum is performed. If + :attr:`None`, sum all elements of :attr:`x` and return a Tensor variable with a single element, otherwise must be in the - range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, - the dimension to reduce is :math:`rank + dim[i]`. - dtype(str, optional): The dtype of output tensor. The default value is None, the dtype - of output is the same as input tensor. - keep_dim (bool, optional): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension - than the :attr:`input` unless :attr:`keep_dim` is true, default + range :math:`[-rank(x), rank(x))`. If :math:`axis[i] < 0`, + the dimension to reduce is :math:`rank + axis[i]`. + dtype (str, optional): The dtype of output Tensor. The default value is None, the dtype + of output is the same as input Tensor `x`. + keepdim (bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result Tensor will have one fewer dimension + than the :attr:`x` unless :attr:`keepdim` is true, default value is False. - name(str, optional): The default value is None. Normally there is no need for + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: Tensor, results of summation operation on the specified dim of input tensor, - it's data type is the same as input's Tensor. + Tensor: Results of summation operation on the specified axis of input Tensor `x`, + it's data type is the same as `x`. Raises: - ValueError, the :attr:`dtype` must be float64 or int64. + ValueError: The :attr:`dtype` must be float64 or int64. + TypeError: The type of :attr:`axis` must be int, list or tuple. Examples: .. 
code-block:: python + import numpy as np import paddle - import paddle.fluid as fluid + paddle.disable_static() + # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the corresponding output tensor. - x = fluid.data(name='x', shape=[2, 4], dtype='float32') + x_data = np.array([[0.2, 0.3, 0.5, 0.9],[0.1, 0.2, 0.6, 0.7]]).astype('float32') + x = paddle.to_variable(x_data) out1 = paddle.sum(x) # [3.5] - out2 = paddle.sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6] - out3 = paddle.sum(x, dim=-1) # [1.9, 1.6] - out4 = paddle.sum(x, dim=1, keep_dim=True) # [[1.9], [1.6]] + out2 = paddle.sum(x, axis=0) # [0.3, 0.5, 1.1, 1.6] + out3 = paddle.sum(x, axis=-1) # [1.9, 1.6] + out4 = paddle.sum(x, axis=1, keepdim=True) # [[1.9], [1.6]] # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] # Each example is followed by the corresponding output tensor. - y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') - out5 = paddle.sum(y, dim=[1, 2]) # [10, 26] - out6 = paddle.sum(y, dim=[0, 1]) # [16, 20] - + y_data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype('float32') + y = paddle.to_variable(y_data) + out5 = paddle.sum(y, axis=[1, 2]) # [10, 26] + out6 = paddle.sum(y, axis=[0, 1]) # [16, 20] """ - if dim is not None and not isinstance(dim, list): - dim = [dim] + if axis is not None and not isinstance(axis, (list, tuple)): + axis = [axis] + + if not axis: + reduce_all_flag = True + else: + if len(axis) == len(x.shape): + reduce_all_flag = True + else: + reduce_all_flag = False + attrs = { - 'dim': dim if dim != None and dim != [] else [0], - 'keep_dim': keep_dim, - 'reduce_all': True if dim == None or dim == [] else False, + 'dim': axis if axis != None and axis != [] and axis != () else [0], + 'keep_dim': keepdim, + 'reduce_all': reduce_all_flag } dtype_flag = False if dtype is not None: if dtype in ['float64', 'int64']: - if (convert_dtype(input.dtype) == "float32" and dtype == "float64") or \ - (convert_dtype(input.dtype) == "int32" and dtype == "int64"): + if (convert_dtype(x.dtype) == "float32" and dtype == "float64") or \ + (convert_dtype(x.dtype) == "int32" and dtype == "int64"): attrs.update({ - 'in_dtype': input.dtype, + 'in_dtype': x.dtype, 'out_dtype': convert_np_dtype_to_dtype_(dtype) }) dtype_flag = True @@ -610,27 +856,28 @@ def sum(input, dim=None, dtype=None, keep_dim=False, name=None): format(dtype)) if in_dygraph_mode(): - reduce_all = True if dim == None or dim == [] else False - dim = dim if dim != None and dim != [] else [0] + axis = axis if axis != None and axis != [] else [0] if dtype_flag: - return core.ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim, - 'reduce_all', reduce_all, 'in_dtype', - input.dtype, 'out_dtype', + return core.ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, + 'reduce_all', reduce_all_flag, 'in_dtype', + x.dtype, 'out_dtype', convert_np_dtype_to_dtype_(dtype)) else: - return core.ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim, - 'reduce_all', reduce_all) + return core.ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, + 'reduce_all', reduce_all_flag) check_variable_and_dtype( - input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_sum') + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum') + check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum') + helper = LayerHelper('sum', **locals()) if dtype_flag: out = helper.create_variable_for_type_inference( 
dtype=convert_np_dtype_to_dtype_(dtype)) else: - out = helper.create_variable_for_type_inference(dtype=input.dtype) + out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='reduce_sum', - inputs={'X': input}, + inputs={'X': x}, outputs={'Out': out}, attrs=attrs) return out @@ -922,69 +1169,73 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): return out -def logsumexp(x, dim=None, keepdim=False, name=None): +def logsumexp(x, axis=None, keepdim=False, name=None): """ - :alias_main: paddle.logsumexp - :alias: paddle.logsumexp,paddle.tensor.logsumexp,paddle.tensor.math.logsumexp - - This operator calculates the log of the sum of exponentials of the input Tensor. + This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` . .. math:: logsumexp(x) = \log\sum exp(x) - - Parameters: - x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64. - dim (list|int, optional): The dimensions along which the sum is performed. If :attr:`None`, - sum all elements of :attr:`input` and return a Tensor variable with a single element, - otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, - the dimension to reduce is :math:`rank + dim[i]`. - keep_dim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. - The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` - is true, default value is False. - name (str, optional): The default value is None. Normally there is no need for user to - set this property. For more information, please refer to :ref:`api_guide_Name` + Args: + x (Tensor): The input Tensor with data type float32, float64. + axis (int|list|tuple, optional): The axis along which to perform + logsumexp calculations. ``axis`` should be int, list(int) or + tuple(int). If ``axis`` is a list/tuple of dimension(s), logsumexp + is calculated along all element(s) of ``axis`` . ``axis`` or + element(s) of ``axis`` should be in range [-D, D), where D is the + dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is + less than 0, it works the same way as :math:`axis + D` . If + ``axis`` is None, logsumexp is calculated along all elements of + ``x``. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keep_dim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: The calcuated result Tensor/LoDTensor. + Tensor, results of logsumexp along ``axis`` of ``x``, with the same data + type as ``x``. Examples: .. code-block:: python import paddle - import paddle.fluid as fluid import numpy as np - with fluid.dygraph.guard(): - np_x = np.random.uniform(0.1, 1, [10]).astype(np.float32) - x = fluid.dygraph.to_variable(np_x) - print(paddle.logsumexp(x).numpy()) - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np + paddle.disable_static() - with fluid.dygraph.guard(): - np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32) - x = fluid.dygraph.to_variable(np_x) - print(paddle.logsumexp(x, dim=1).numpy()) - print(paddle.logsumexp(x, dim=[0, 2]).numpy()) + x = np.array([[-1.5, 0., 2.], [3., 1.2, -2.4]]) + x = paddle.to_tensor(x) + out1 = paddle.logsumexp(x) # [3.4691226] + out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602] """ - op_type = 'logsumexp' - assert x is not None, 'x cannot be None in {}'.format(op_type) + if isinstance(axis, int): + axis = [axis] + reduce_all = True if axis is None \ + or len(axis)==0 \ + or len(axis) == len(x.shape) else False + if axis is None or len(axis) == 0: + axis = [0] - # reduce_sum does not support float16 - check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) - - exp_out = layers.exp(x) - sum_out = layers.reduce_sum(exp_out, dim, keepdim) + if in_dygraph_mode(): + return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim, + 'reduce_all', reduce_all) - return layers.log(sum_out, name) + check_variable_and_dtype(x, 'x', + ['float32', 'float64'], + 'logsumexp') + helper = LayerHelper('logsumexp', **locals()) + attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs) + return out def inverse(x, name=None): @@ -1314,14 +1565,14 @@ def addcmul(input, tensor1, tensor2, value=1.0, name=None): return out -def clamp(input, min=None, max=None, name=None): +def clip(x, min=None, max=None, name=None): """ - :alias_main: paddle.clamp - :alias: paddle.clamp,paddle.tensor.clamp,paddle.tensor.math.clamp + :alias_main: paddle.clip + :alias: paddle.clip,paddle.tensor.clip,paddle.tensor.math.clip - **clampe layer** + **clip layer** - This operator clamps all elements in input into the range [ min, max ] and return + This operator clip all elements in input into the range [ min, max ] and return a resulting tensor as the following equation: .. math:: @@ -1329,60 +1580,63 @@ def clamp(input, min=None, max=None, name=None): Out = MIN(MAX(x, min), max) Args: - input (Variable): An input N-D Tensor or LoDTensor - with data type float32, float64. - min (float32|Variable): The lower bound with type ``float32`` or a ``Tensor`` + x (Tensor): An N-D Tensor with data type float32 or float64. + min (float32|Tensor): The lower bound with type ``float32`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. - max (float32|Variable): The upper bound with type ``float32`` or a ``Tensor`` + max (float32|Tensor): The upper bound with type ``float32`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: A Tensor or LodTensor with the same data type and data shape as input's. + Tensor: A Tensor with the same data type and data shape as input. Examples: .. 
code-block:: python import paddle - import paddle.fluid as fluid import numpy as np - in1 = np.array([[1.2,3.5], - [4.5,6.4]]).astype('float32') - with fluid.dygraph.guard(): - x1 = fluid.dygraph.to_variable(in1) - out1 = paddle.tensor.clamp(x1, min=3.5, max=5.0) - out2 = paddle.tensor.clamp(x1, min=2.5) - print(out1.numpy()) - # [[3.5, 3.5] - # [4.5, 5.0]] - print(out2.numpy()) - # [[2.5, 3.5] - # [[4.5, 6.4] + paddle.disable_static() + x = np.array([[1.2,3.5], [4.5,6.4]]).astype('float32') + x1 = paddle.to_variable(x) + out1 = paddle.clip(x1, min=3.5, max=5.0) + out2 = paddle.clip(x1, min=2.5) + print(out1.numpy()) + # [[3.5, 3.5] + # [4.5, 5.0]] + print(out2.numpy()) + # [[2.5, 3.5] + # [[4.5, 6.4] """ - assert min is not None or max is not None, "either min or max should be defined." + np_dtype = np.float32 + if x.dtype == VarDesc.VarType.FP64: + np_dtype = np.float64 + fmin = float(np.finfo(np_dtype).min) + fmax = float(np.finfo(np_dtype).max) if in_dygraph_mode(): - min = sys.float_info.min if min is None else min - max = sys.float_info.max if max is None else max - return core.ops.clip(input, "min", min, "max", max) + min = fmin if min is None else min + max = fmax if max is None else max + return core.ops.clip(x, "min", min, "max", max) if min is not None: - check_type(min, 'min', (float, Variable), 'clamp') + check_type(min, 'min', (float, int, Variable), 'clip') if isinstance(min, Variable): check_dtype(min.dtype, 'min', ['float32', 'float64', 'int32'], - 'clamp', '(When the type of min in clamp is Variable.)') + 'clip', '(When the type of min in clip is Variable.)') if max is not None: - check_type(max, 'max', (float, Variable), 'clamp') + check_type(max, 'max', (float, int, Variable), 'clip') if isinstance(max, Variable): check_dtype(max.dtype, 'max', ['float32', 'float64', 'int32'], - 'clamp', '(When the type of max in clamp is Variable.)') + 'clip', '(When the type of max in clip is Variable.)') + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'clip') - inputs = {'X': input} - attrs = {'min': sys.float_info.min, 'max': sys.float_info.max} + inputs = {'X': x} + attrs = {'min': fmin, 'max': fmax} if isinstance(min, Variable): min.stop_gradient = True @@ -1396,9 +1650,9 @@ def clamp(input, min=None, max=None, name=None): elif max is not None: attrs['max'] = max - helper = LayerHelper('clamp', **locals()) + helper = LayerHelper('clip', **locals()) output = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype()) helper.append_op( type='clip', inputs=inputs, outputs={'Out': [output]}, attrs=attrs) @@ -1622,3 +1876,255 @@ def cumsum(x, axis=None, dtype=None, name=None): kwargs[name] = val _cum_sum_ = generate_layer_fn('cumsum') return _cum_sum_(**kwargs) + +def isfinite(x, name=None): + """ + + Return whether every element of input tensor is finite number or not. + + Args: + x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + `Tensor`, the bool result which shows every element of `x` whether it is finite number or not. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) + x = paddle.to_tensor(x_np) + out = paddle.tensor.isfinite(x) + print(out.numpy()) # [False True True False True False False] + """ + if in_dygraph_mode(): + return core.ops.isfinite_v2(x) + helper = LayerHelper("isfinite_v2", **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isfinite') + out = helper.create_variable_for_type_inference('bool') + helper.append_op(type="isfinite_v2", inputs={"X": x}, outputs={"Out": out}) + return out + +def isinf(x, name=None): + """ + + Return whether every element of input tensor is `+/-INF` or not. + + Args: + x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + `Tensor`, the bool result which shows every element of `x` whether it is `+/-INF` or not. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) + x = paddle.to_tensor(x_np) + out = paddle.tensor.isinf(x) + print(out.numpy()) # [ True False False True False False False] + """ + if in_dygraph_mode(): + return core.ops.isinf_v2(x) + helper = LayerHelper("isinf_v2", **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf') + out = helper.create_variable_for_type_inference(dtype='bool') + helper.append_op(type="isinf_v2", inputs={"X": x}, outputs={"Out": out}) + return out + +def isnan(x, name=None): + """ + + Return whether every element of input tensor is `NaN` or not. + + Args: + x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + `Tensor`, the bool result which shows every element of `x` whether it is `NaN` or not. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) + x = paddle.to_tensor(x_np) + out = paddle.tensor.isnan(x) + print(out.numpy()) # [False False False False False True True] + """ + if in_dygraph_mode(): + return core.ops.isnan_v2(x) + helper = LayerHelper("isnan_v2", **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan') + out = helper.create_variable_for_type_inference(dtype='bool') + helper.append_op(type="isnan_v2", inputs={"X": x}, outputs={"Out": out}) + return out + + +def prod(x, axis=None, keepdim=False, dtype=None, name=None): + """ + Compute the product of tensor elements over the given axis. + + Args: + x(Tensor): The input tensor, its data type should be float32, float64, int32, int64. + axis(int|list|tuple, optional): The axis along which the product is computed. If :attr:`None`, + multiply all elements of `x` and return a Tensor with a single element, + otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i]<0`, + the axis to reduce is :math:`x.ndim + axis[i]`. Default is None. 
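# --- Reviewer note (not part of the patch): NumPy reference for the three predicates
# added above (isfinite / isinf / isnan); the Paddle *_v2 kernels return a bool Tensor
# with the same elementwise results. Illustration only.
import numpy as np
v = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')])
print(np.isfinite(v))  # [False  True  True False  True False False]
print(np.isinf(v))     # [ True False False  True False False False]
print(np.isnan(v))     # [False False False False False  True  True]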
+ dtype(str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, + int32, int64. If specified, the input tensor is casted to dtype before operator performed. + This is very useful for avoiding data type overflows. The default value is None, the dtype + of output is the same as input Tensor `x`. + keepdim(bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result + tensor will have one fewer dimension than the input unless `keepdim` is true. Default is False. + name(string, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor, result of product on the specified dim of input tensor. + + Raises: + ValueError: The :attr:`dtype` must be float32, float64, int32 or int64. + TypeError: The type of :attr:`axis` must be int, list or tuple. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + # the axis is a int element + data_x = np.array([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]).astype(np.float32) + x = paddle.to_tensor(data_x) + out1 = paddle.prod(x) + print(out1.numpy()) + # [0.0002268] + + out2 = paddle.prod(x, -1) + print(out2.numpy()) + # [0.027 0.0084] + + out3 = paddle.prod(x, 0) + print(out3.numpy()) + # [0.02 0.06 0.3 0.63] + print(out3.numpy().dtype) + # float32 + + out4 = paddle.prod(x, 0, keepdim=True) + print(out4.numpy()) + # [[0.02 0.06 0.3 0.63]] + + out5 = paddle.prod(x, 0, dtype='int64') + print(out5.numpy()) + # [0 0 0 0] + print(out5.numpy().dtype) + # int64 + + # the axis is list + data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) + y = paddle.to_tensor(data_y) + out6 = paddle.prod(y, [0, 1]) + print(out6.numpy()) + # [105. 384.] + + out7 = paddle.prod(y, (1, 2)) + print(out7.numpy()) + # [ 24. 1680.] + + """ + if dtype is not None: + check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'prod') + if x.dtype != convert_np_dtype_to_dtype_(dtype): + x = layers.cast(x, dtype) + + return layers.reduce_prod(input=x, dim=axis, keep_dim=keepdim, name=name) + + +def sign(x, name=None): + """ + This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero. + + Args: + x(Tensor): The input tensor. The data type can be float16, float32 or float64. + name (str, optional): The default value is None. Normally there is no need for user to + set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor: The output sign tensor with identical shape and data type to the input :attr:`x`. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + data = np.array([3.0, 0.0, -2.0, 1.7], dtype='float32') + paddle.disable_static() + x = paddle.to_tensor(data) + out = paddle.sign(x=x) + print(out) # [1.0, 0.0, -1.0, 1.0] + """ + if in_dygraph_mode(): + return core.ops.sign(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'sign') + helper = LayerHelper("sign", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op(type='sign', inputs={'X': [x]}, outputs={'Out': [out]}) + + return out + + +def tanh(x, name=None): + """ + Tanh Activation Operator. + + .. math:: + out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} + + Args: + x (Tensor): Input of Tanh operator, an N-D Tensor, with data type float32, float64 or float16. 
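# --- Reviewer note (not part of the patch): NumPy cross-check of the prod() examples
# above; prod() lowers to reduce_prod, casting the input first when a dtype is given.
# Illustration only.
import numpy as np
px = np.array([[0.2, 0.3, 0.5, 0.9],
               [0.1, 0.2, 0.6, 0.7]], dtype='float32')
print(np.prod(px))                          # ~0.0002268
print(np.prod(px, axis=-1))                 # [0.027  0.0084]
print(np.prod(px, axis=0, keepdims=True))   # [[0.02 0.06 0.3  0.63]]
print(np.prod(px.astype('int64'), axis=0))  # [0 0 0 0] -- cast happens before the reduce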
+ name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Output of Tanh operator, a Tensor with same data type and shape as input. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x_data = np.array([-0.4, -0.2, 0.1, 0.3]) + x = paddle.to_tensor(x_data) + out = paddle.tanh(x) + print(out.numpy()) + # [-0.37994896 -0.19737532 0.09966799 0.29131261] + """ + if in_dygraph_mode(): + return core.ops.tanh(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'tanh') + check_type(x, 'x', (Variable), 'tanh') + helper = LayerHelper('tanh', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='tanh', inputs={'X': x}, outputs={'Out': out}) + return out diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index d26003fd826cfb3f3905b6aeea3e9492fab39cea..c652d0f1891c8bd0a4c85ea777527a2fd82ad11b 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -21,22 +21,411 @@ from ..fluid.framework import device_guard, in_dygraph_mode, _varbase_creator, V from ..fluid.layers.layer_function_generator import templatedoc from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype -from ..fluid.layers import utils, uniform_random, gaussian_random +from ..fluid.layers import utils from ..fluid.layers.tensor import fill_constant +import paddle +import warnings from ..fluid.io import shuffle #DEFINE_ALIAS __all__ = [ - # 'gaussin', - # 'uniform', + 'bernoulli', + 'standard_normal', + 'normal', + 'uniform', 'shuffle', 'randn', 'rand', 'randint', - 'randperm' + 'randperm', ] +def bernoulli(x, name=None): + """ + + This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution. + The input ``x`` is a tensor with probabilities for generating the random binary number. + Each element in ``x`` should be in [0, 1], and the out is generated by: + + .. math:: + + out_i ~ Bernoulli (x_i) + + Args: + x(Tensor): A tensor with probabilities for generating the random binary number. The data type + should be float32, float64. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + Returns: + Tensor: A Tensor filled with random binary number with the same shape and dtype as ``x``. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = paddle.rand([2, 3]) + print(x.numpy()) + # [[0.11272584 0.3890902 0.7730957 ] + # [0.10351662 0.8510418 0.63806665]] + + out = paddle.bernoulli(x) + print(out.numpy()) + # [[0. 0. 1.] + # [0. 0. 1.]] + + """ + + if in_dygraph_mode(): + return core.ops.bernoulli(x) + + check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli") + + helper = LayerHelper("randint", **locals()) + out = helper.create_variable_for_type_inference( + dtype=x.dtype) # maybe set out to int32 ? + helper.append_op( + type='bernoulli', inputs={"X": x}, outputs={'Out': out}, attrs={}) + return out + + +def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None): + """ + This OP returns a Tensor filled with random values sampled from a Gaussian + distribution, with ``shape`` and ``dtype``. + + Args: + shape(list|tuple|Tensor): The shape of the output Tensor. 
If ``shape`` + is a list or tuple, the elements of it should be integers or Tensors + (with the shape [1], and the data type int32 or int64). If ``shape`` + is a Tensor, it should be a 1-D Tensor(with the data type int32 or + int64). + mean(float|int, optional): Mean of the output tensor, default is 0.0. + std(float|int, optional): Standard deviation of the output tensor, default + is 1.0. + seed(int, optional): ${seed_comment} + dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of + the output Tensor. Supported data types: float32, float64. + Default is float32. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + + Returns: + Tensor: A Tensor filled with random values sampled from a Gaussian + distribution, with ``shape`` and ``dtype``. + """ + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + seed = 0 + op_type_for_check = 'gaussian_random/standard_normal/randn/normal' + + if in_dygraph_mode(): + shape = utils._convert_shape_to_list(shape) + return core.ops.gaussian_random('shape', shape, 'mean', + float(mean), 'std', + float(std), 'seed', seed, 'dtype', + dtype) + + check_type(shape, 'shape', (list, tuple, Variable), op_type_for_check) + check_dtype(dtype, 'dtype', ['float32', 'float64'], op_type_for_check) + + inputs = {} + attrs = { + 'mean': mean, + 'std': std, + 'seed': seed, + 'dtype': dtype, + 'use_mkldnn': False + } + utils._get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check) + + helper = LayerHelper('gaussian_random', **locals()) + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='gaussian_random', + inputs=inputs, + outputs={'Out': out}, + attrs=attrs) + out.stop_gradient = True + return out + + +def standard_normal(shape, dtype=None, name=None): + """ + This OP returns a Tensor filled with random values sampled from a standard + normal distribution with mean 0 and standard deviation 1, with ``shape`` + and ``dtype``. + + Args: + shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + is a list or tuple, the elements of it should be integers or Tensors + (with the shape [1], and the data type int32 or int64). If ``shape`` + is a Tensor, it should be a 1-D Tensor(with the data type int32 or + int64). + dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the + output tensor. Supported data types: float32, float64. If ``dytpe`` + is None, the data type is float32. Default is None. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: A Tensor filled with random values sampled from a standard + normal distribution with mean 0 and standard deviation 1, with + ``shape`` and ``dtype``. + + Raises: + TypeError: If ``shape`` is not list, tuple, Tensor. + TypeError: If ``dtype`` is not float32, float64. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + # example 1: attr shape is a list which doesn't contain Tensor. + result_1 = paddle.standard_normal(shape=[2, 3]) + # [[-2.923464 , 0.11934398, -0.51249987], # random + # [ 0.39632758, 0.08177969, 0.2692008 ]] # random + + # example 2: attr shape is a list which contains Tensor. 
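# --- Reviewer note (not part of the patch): standard_normal()/randn() are just
# gaussian_random() pinned to mean=0.0, std=1.0; the NumPy equivalent for reference,
# illustration only.
import numpy as np
ref = np.random.standard_normal((2, 3)).astype('float32')   # N(0, 1), shape (2, 3)
shifted = 2.5 + 0.5 * np.random.standard_normal((2, 3))     # N(2.5, 0.5**2)
print(ref.shape, shifted.shape)  # (2, 3) (2, 3)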
+ dim_1 = paddle.fill_constant([1], "int64", 2) + dim_2 = paddle.fill_constant([1], "int32", 3) + result_2 = paddle.standard_normal(shape=[dim_1, dim_2, 2]) + # [[[-2.8852394 , -0.25898588], # random + # [-0.47420555, 0.17683524], # random + # [-0.7989969 , 0.00754541]], # random + # [[ 0.85201347, 0.32320443], # random + # [ 1.1399018 , 0.48336947], # random + # [ 0.8086993 , 0.6868893 ]]] # random + + # example 3: attr shape is a Tensor, the data type must be int64 or int32. + var_shape = paddle.to_tensor(np.array([2, 3])) + result_3 = paddle.standard_normal(var_shape) + # [[-2.878077 , 0.17099959, 0.05111201] # random + # [-0.3761474, -1.044801 , 1.1870178 ]] # random + + """ + if dtype is None: + dtype = 'float32' + + return gaussian_random( + shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) + + +randn = standard_normal + + +def normal(mean=0.0, std=1.0, shape=None, name=None): + """ + This OP returns a Tensor filled with random values sampled from a normal + distribution with ``mean`` and ``std`` (standard deviation) . + + If ``mean`` is a Tensor, the output Tensor has the same shape and data type as ``mean``. + If ``mean`` is not a Tensor and ``std`` is a Tensor, the output Tensor has the same shape and data type as ``std``. + If ``mean`` and ``std`` are not a Tensor, the output Tensor has the same shape as ``shape``, with data type float32. + + If ``mean`` and ``std`` are Tensor, the num of elements of ``mean`` and ``std`` should be the same. + + Args: + mean (float|Tensor, optional): The mean of the output Tensor's normal distribution. + If ``mean`` is float, all elements of the output Tensor shared the same mean. + If ``mean`` is a Tensor(data type supports float32, float64), it has per-element means. + Default is 0.0 + std (float|Tensor, optional): The standard deviation of the output Tensor's normal distribution. + If ``std`` is float, all elements of the output Tensor shared the same standard deviation. + If ``std`` is a Tensor(data type supports float32, float64), it has per-element standard deviations. + Defaule is 1.0 + shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape`` + is a list or tuple, the elements of it should be integers or Tensors + (with the shape [1], and the data type int32 or int64). If ``shape`` + is a Tensor, it should be a 1-D Tensor(with the data type int32 or + int64). If ``mean`` or ``std`` is a Tensor, the shape of the output + Tensor is the same as ``mean`` or ``std`` , attr ``shape`` is ignored. + Default is None + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` . + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + out1 = paddle.normal(shape=[2, 3]) + # [[ 0.17501129 0.32364586 1.561118 ] # random + # [-1.7232178 1.1545963 -0.76156676]] # random + + mean_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + out2 = paddle.normal(mean=mean_tensor) + # [ 0.18644847 -1.19434458 3.93694787] # random + + std_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + out3 = paddle.normal(mean=mean_tensor, std=std_tensor) + # [1.00780561 3.78457445 5.81058198] # random + + """ + if not in_dygraph_mode(): + check_type(mean, 'mean', (int, float, Variable), 'normal') + check_type(std, 'std', (int, float, Variable), 'normal') + if isinstance(mean, Variable): + check_dtype( + mean.dtype, 'mean', ['float32', 'float64'], 'normal', + "If mean is Tensor, it's data type only support float32, float64." + ) + if isinstance(std, Variable): + check_dtype( + std.dtype, 'std', ['float32', 'float64'], 'normal', + "If std is Tensor, it's data type only support float32, float64." + ) + if shape is not None: + if isinstance(shape, (list, tuple)): + for item in shape: + check_type(item, 'shape', (int), 'normal', + 'Elements of shape should be int.') + elif isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'normal') + else: + assert TypeError( + 'If mean and std are all not Tensor, shape should be list, tuple, Tensor.' + ) + + if isinstance(mean, Variable): + if isinstance(std, Variable): + if std.dtype != mean.dtype: + std = paddle.cast(std, mean.dtype) + mean_shape = paddle.shape(mean) + std = paddle.reshape(std, mean_shape) + else: + std = float(std) + out = standard_normal(paddle.shape(mean), mean.dtype, name) + elif isinstance(std, Variable): + mean = float(mean) + out = standard_normal(paddle.shape(std), std.dtype, name) + else: + return gaussian_random(shape=shape, mean=mean, std=std, name=name) + + out = out * std + mean + if not in_dygraph_mode(): + out.stop_grediant = True + return out + + +def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): + """ + This OP returns a Tensor filled with random values sampled from a uniform + distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. + + Examples: + :: + Input: + shape = [1, 2] + Output: + result=[[0.8505902, 0.8397286]] + + Args: + shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + is a list or tuple, the elements of it should be integers or Tensors + (with the shape [1], and the data type int32 or int64). If ``shape`` + is a Tensor, it should be a 1-D Tensor(with the data type int32 or + int64). + dtype(str|np.dtype, optional): The data type of + the output Tensor. Supported data types: float32, float64. + Default is float32. + min(float|int, optional): The lower bound on the range of random values + to generate, ``min`` is included in the range. Default is -1.0. + max(float|int, optional): The upper bound on the range of random values + to generate, ``max`` is excluded in the range. Default is 1.0. + seed(int, optional): Random seed used for generating samples. 0 means + use a seed generated by the system. Note that if seed is not 0, + this operator will always generate the same random numbers every + time. Default is 0. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. 
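# --- Reviewer note (not part of the patch): the tensor-mean/std path in normal()
# above reduces to "scale and shift a standard-normal draw", i.e.
# out = standard_normal(shape) * std + mean. NumPy sketch of the same idea, illustration only.
import numpy as np
mean = np.array([1.0, 2.0, 3.0], dtype='float32')
std = np.array([1.0, 2.0, 3.0], dtype='float32')
out = np.random.standard_normal(mean.shape).astype('float32') * std + mean
print(out.shape)  # (3,) -- one draw per (mean, std) pair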
+ + Returns: + Tensor: A Tensor filled with random values sampled from a uniform + distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. + + Raises: + TypeError: If ``shape`` is not list, tuple, Tensor. + TypeError: If ``dtype`` is not float32, float64. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + + # example 1: + # attr shape is a list which doesn't contain Tensor. + result_1 = paddle.tensor.random.uniform(shape=[3, 4]) + # [[ 0.84524226, 0.6921872, 0.56528175, 0.71690357], + # [-0.34646994, -0.45116323, -0.09902662, -0.11397249], + # [ 0.433519, 0.39483607, -0.8660099, 0.83664286]] + + # example 2: + # attr shape is a list which contains Tensor. + dim_1 = paddle.fill_constant([1], "int64", 2) + dim_2 = paddle.fill_constant([1], "int32", 3) + result_2 = paddle.tensor.random.uniform(shape=[dim_1, dim_2]) + # [[-0.9951253, 0.30757582, 0.9899647 ], + # [ 0.5864527, 0.6607096, -0.8886161 ]] + + # example 3: + # attr shape is a Tensor, the data type must be int64 or int32. + shape = np.array([2, 3]) + shape_tensor = paddle.to_tensor(shape) + result_3 = paddle.tensor.random.uniform(shape_tensor) + # if shape_tensor's value is [2, 3] + # result_3 is: + # [[-0.8517412, -0.4006908, 0.2551912 ], + # [ 0.3364414, 0.36278176, -0.16085452]] + + + """ + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + if in_dygraph_mode(): + shape = utils._convert_shape_to_list(shape) + return core.ops.uniform_random('shape', shape, 'min', + float(min), 'max', + float(max), 'seed', seed, 'dtype', dtype) + + check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand') + check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand') + + inputs = dict() + attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} + utils._get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand') + + helper = LayerHelper("uniform_random", **locals()) + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="uniform_random", inputs=inputs, attrs=attrs, + outputs={"Out": out}) + return out + + def randint(low=0, high=None, shape=[1], dtype=None, name=None): """ :alias_main: paddle.randint @@ -78,40 +467,40 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): Examples: .. code-block:: python - import paddle - import numpy as np - - paddle.disable_static() - - # example 1: - # attr shape is a list which doesn't contain Tensor. - result_1 = paddle.randint(low=-5, high=5, shape=[3]) - # [0, -3, 2] - - # example 2: - # attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32") - # [[0, -1, -3], - # [4, -2, 0]] - - # example 3: - # attr shape is a Tensor - var_shape = paddle.to_variable(np.array([3])) - result_3 = paddle.randint(low=-5, high=5, shape=var_shape) - # [-2, 2, 3] - - # example 4: - # data type is int32 - result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32') - # [-5, 4, -4] - - # example 5: - # Input only one parameter - # low=0, high=10, shape=[1], dtype='int64' - result_5 = paddle.randint(10) - # [7] + import paddle + import numpy as np + + paddle.disable_static() + + # example 1: + # attr shape is a list which doesn't contain Tensor. 
+ result_1 = paddle.randint(low=-5, high=5, shape=[3]) + # [0, -3, 2] # random + + # example 2: + # attr shape is a list which contains Tensor. + dim_1 = paddle.fill_constant([1], "int64", 2) + dim_2 = paddle.fill_constant([1], "int32", 3) + result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32") + # [[0, -1, -3], # random + # [4, -2, 0]] # random + + # example 3: + # attr shape is a Tensor + var_shape = paddle.to_variable(np.array([3])) + result_3 = paddle.randint(low=-5, high=5, shape=var_shape) + # [-2, 2, 3] # random + + # example 4: + # data type is int32 + result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32') + # [-5, 4, -4] # random + + # example 5: + # Input only one parameter + # low=0, high=10, shape=[1], dtype='int64' + result_5 = paddle.randint(10) + # [7] # random """ if high is None: @@ -150,77 +539,6 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): return out -def randn(shape, dtype=None, name=None): - """ - :alias_main: paddle.randn - :alias: paddle.tensor.randn, paddle.tensor.random.randn - - This OP returns a Tensor filled with random values sampled from a normal - distribution with mean 0 and standard deviation 1 (also called the standard - normal distribution), with ``shape`` and ``dtype``. - - Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the - output tensor. Supported data types: float32, float64. If ``dytpe`` - is None, the data type is float32. Default is None. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - - Returns: - Tensor: A Tensor filled with random values sampled from a normal - distribution with mean 0 and standard deviation 1 (also called the - standard normal distribution), with ``shape`` and ``dtype``. - - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - - Examples: - .. code-block:: python - - import paddle - import numpy as np - - paddle.disable_static() - - # example 1: attr shape is a list which doesn't contain Tensor. - result_1 = paddle.randn(shape=[2, 3]) - # [[-2.923464 , 0.11934398, -0.51249987], - # [ 0.39632758, 0.08177969, 0.2692008 ]] - - # example 2: attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.randn(shape=[dim_1, dim_2, 2]) - # [[[-2.8852394 , -0.25898588], - # [-0.47420555, 0.17683524], - # [-0.7989969 , 0.00754541]], - # [[ 0.85201347, 0.32320443], - # [ 1.1399018 , 0.48336947], - # [ 0.8086993 , 0.6868893 ]]] - - # example 3: attr shape is a Tensor, the data type must be int64 or int32. 
- var_shape = paddle.to_variable(np.array([2, 3])) - result_3 = paddle.randn(var_shape) - # [[-2.878077 , 0.17099959, 0.05111201] - # [-0.3761474, -1.044801 , 1.1870178 ]] - - """ - if dtype is None: - dtype = 'float32' - - out = gaussian_random( - shape=shape, mean=0.0, std=1.0, seed=0, dtype=dtype, name=name) - out.stop_gradient = True - return out - - @templatedoc() def randperm(n, dtype="int64", name=None): """ @@ -250,15 +568,15 @@ def randperm(n, dtype="int64", name=None): Examples: .. code-block:: python - import paddle + import paddle - paddle.disable_static() + paddle.disable_static() - result_1 = paddle.randperm(5) - # [4, 1, 2, 3, 0] + result_1 = paddle.randperm(5) + # [4, 1, 2, 3, 0] # random - result_2 = paddle.randperm(7, 'int32') - # [1, 6, 2, 0, 4, 3, 5] + result_2 = paddle.randperm(7, 'int32') + # [1, 6, 2, 0, 4, 3, 5] # random """ if not isinstance(dtype, core.VarDesc.VarType): @@ -322,36 +640,36 @@ def rand(shape, dtype=None, name=None): Examples: .. code-block:: python - import paddle - import numpy as np - - paddle.disable_static() - # example 1: attr shape is a list which doesn't contain Tensor. - result_1 = paddle.rand(shape=[2, 3]) - # [[0.451152 , 0.55825245, 0.403311 ], - # [0.22550228, 0.22106001, 0.7877319 ]] - - # example 2: attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.rand(shape=[dim_1, dim_2, 2]) - # [[[0.8879919 , 0.25788337], - # [0.28826773, 0.9712097 ], - # [0.26438272, 0.01796806]], - # [[0.33633623, 0.28654453], - # [0.79109055, 0.7305809 ], - # [0.870881 , 0.2984597 ]]] - - # example 3: attr shape is a Tensor, the data type must be int64 or int32. - var_shape = paddle.to_variable(np.array([2, 3])) - result_3 = paddle.rand(var_shape) - # [[0.22920267, 0.841956 , 0.05981819], - # [0.4836288 , 0.24573246, 0.7516129 ]] + import paddle + import numpy as np + + paddle.disable_static() + # example 1: attr shape is a list which doesn't contain Tensor. + result_1 = paddle.rand(shape=[2, 3]) + # [[0.451152 , 0.55825245, 0.403311 ], # random + # [0.22550228, 0.22106001, 0.7877319 ]] # random + + # example 2: attr shape is a list which contains Tensor. + dim_1 = paddle.fill_constant([1], "int64", 2) + dim_2 = paddle.fill_constant([1], "int32", 3) + result_2 = paddle.rand(shape=[dim_1, dim_2, 2]) + # [[[0.8879919 , 0.25788337], # random + # [0.28826773, 0.9712097 ], # random + # [0.26438272, 0.01796806]], # random + # [[0.33633623, 0.28654453], # random + # [0.79109055, 0.7305809 ], # random + # [0.870881 , 0.2984597 ]]] # random + + # example 3: attr shape is a Tensor, the data type must be int64 or int32. 
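# --- Reviewer note (not part of the patch): with this change randn is an alias of
# standard_normal(), and rand() below simply calls uniform(min=0.0, max=1.0). NumPy
# sketch of the two distributions involved, illustration only.
import numpy as np
u = np.random.uniform(0.0, 1.0, size=(2, 3)).astype('float32')   # what rand() samples
g = np.random.standard_normal((2, 3)).astype('float32')          # what randn() samples
print(u.shape, g.shape)  # (2, 3) (2, 3)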
+ var_shape = paddle.to_variable(np.array([2, 3])) + result_3 = paddle.rand(var_shape) + # [[0.22920267, 0.841956 , 0.05981819], # random + # [0.4836288 , 0.24573246, 0.7516129 ]] # random """ if dtype is None: dtype = 'float32' - out = uniform_random(shape, dtype, min=0.0, max=1.0, name=name) + out = uniform(shape, dtype, min=0.0, max=1.0, name=name) out.stop_gradient = True return out diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 0f8381d824027670250f4b59607b4275e43b5e22..eede022e05ba61bc23da517e7af7cd2eb58f5416 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -21,7 +21,6 @@ from ..fluid import core, layers from ..fluid.layers import argmin #DEFINE_ALIAS from ..fluid.layers import has_inf #DEFINE_ALIAS from ..fluid.layers import has_nan #DEFINE_ALIAS -from ..fluid.layers import topk #DEFINE_ALIAS __all__ = [ 'argmax', @@ -29,13 +28,13 @@ __all__ = [ 'argsort', 'has_inf', 'has_nan', - # 'masked_select', + 'masked_select', 'topk', 'where', 'index_select', 'nonzero', 'sort', - 'index_sample' + 'index_sample', ] from paddle.common_ops_import import * @@ -125,95 +124,168 @@ def argsort(x, axis=-1, descending=False, name=None): return ids -def argmax(input, axis=None, dtype=None, out=None, keepdims=False, name=None): +def argmax(x, axis=None, dtype=None, keepdim=False, name=None): """ - :alias_main: paddle.argmax - :alias: paddle.argmax,paddle.tensor.argmax,paddle.tensor.search.argmax - This OP computes the indices of the max elements of the input tensor's element along the provided axis. Args: - input(Variable): An input N-D Tensor with type float32, float64, int16, + x(Tensor): An input N-D Tensor with type float32, float64, int16, int32, int64, uint8. axis(int, optional): Axis to compute indices along. The effective range - is [-R, R), where R is Rank(input). when axis<0, it works the same way - as axis+R. Default is None, it will use the last dim to select indices of max value. - dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output tensor which can + is [-R, R), where R is x.ndim. when axis < 0, it works the same way + as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. + dtype(str): Data type of the output tensor which can be int32, int64. The default value is None, and it will return the int64 indices. - out(Variable, optional): Optional output which can be any created - Variable that meets the requirements to store the result of operation. - if out is None, a new Varibale will be create to store the result. Defalut is None. - keepdims(bool, optional): Keep the axis that do the select max. + keepdim(bool, optional): Keep the axis that selecting max. The defalut value is False. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: A Tensor with data type int64. + Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64` Examples: .. 
code-block:: python - import paddle - import paddle.fluid as fluid import numpy as np + import paddle - in1 = np.array([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]]) - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(in1) - out1 = paddle.argmax(input=x, axis=-1) - out2 = paddle.argmax(input=x, axis=0) - out3 = paddle.argmax(input=x, axis=1) - out4 = paddle.argmax(input=x, axis=2) - out5 = paddle.argmax(input=x, axis=2, keepdims=True) - print(out1.numpy()) - # [[2 3 1] - # [0 3 1]] - print(out2.numpy()) - # [[0 0 0 0] - # [1 1 1 1] - # [0 0 0 1]] - print(out3.numpy()) - # [[2 2 0 1] - # [0 1 1 1]] - print(out4.numpy()) - # [[2 3 1] - # [0 3 1]] - print(out5.numpy()) - #array([[[2], - # [3], - # [1]], - # [[0], - # [3], - # [1]]]) + paddle.disable_static() + data = np.array([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) + x = paddle.to_variable(data) + out1 = paddle.argmax(x) + print(out1.numpy()) # 2 + out2 = paddle.argmax(x, axis=1) + print(out2.numpy()) + # [2 3 1] + out3 = paddle.argmax(x, axis=-1) + print(out3.numpy()) + # [2 3 1] """ - helper = LayerHelper("arg_max", **locals()) + flatten = False + if axis is None: + flatten = True + axis = 0 + + if in_dygraph_mode(): + if dtype != None: + var_dtype = convert_np_dtype_to_dtype_(dtype) + out = core.ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, + 'keepdim', keepdim, 'flatten', flatten) + else: + out = core.ops.arg_max(x, 'axis', axis, 'keepdim', keepdim, + 'flatten', flatten) + return out + + helper = LayerHelper("argmax", **locals()) + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], + 'paddle.argmax') var_dtype = None attrs = {} if dtype is not None: - check_dtype(dtype, 'create data type', ['int32', 'int64'], 'arg_max') + if dtype not in ['int32', 'int64']: + raise ValueError( + "The value of 'dtype' in argmax op must be int32, int64, but received of {}". + format(dtype)) var_dtype = convert_np_dtype_to_dtype_(dtype) attrs["dtype"] = var_dtype else: var_dtype = VarDesc.VarType.INT64 - if out is None: - out = helper.create_variable_for_type_inference(var_dtype) + + out = helper.create_variable_for_type_inference(var_dtype) + attrs['keepdims'] = keepdim + attrs['axis'] = axis + attrs['flatten'] = flatten + helper.append_op( + type='arg_max', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs) + out.stop_gradient = True + return out + + +def argmin(x, axis=None, dtype=None, keepdim=False, name=None): + """ + This OP computes the indices of the min elements of the input tensor's + element along the provided axis. + + Args: + x(Tensor): An input N-D Tensor with type float32, float64, int16, + int32, int64, uint8. + axis(int, optional): Axis to compute indices along. The effective range + is [-R, R), where R is x.ndim. when axis < 0, it works the same way + as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. + dtype(str): Data type of the output tensor which can + be int32, int64. The default value is None, and it will + return the int64 indices. + keepdim(bool, optional): Keep the axis that selecting min. The defalut value is False. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + + Returns: + Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64` + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + data = np.array([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) + x = paddle.to_variable(data) + out1 = paddle.argmin(x) + print(out1.numpy()) # 4 + out2 = paddle.argmin(x, axis=1) + print(out2.numpy()) + # [0 0 2] + out3 = paddle.argmin(x, axis=-1) + print(out3.numpy()) + # [0 0 2] + """ + flatten = False if axis is None: - axis = -1 - attrs['keepdims'] = keepdims + flatten = True + axis = 0 + + if in_dygraph_mode(): + if dtype != None: + var_dtype = convert_np_dtype_to_dtype_(dtype) + out = core.ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, + 'keepdim', keepdim, 'flatten', flatten) + else: + out = core.ops.arg_min(x, 'axis', axis, 'keepdim', keepdim, + 'flatten', flatten) + return out + + helper = LayerHelper("argmin", **locals()) + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], + 'paddle.argmin') + var_dtype = None + attrs = {} + if dtype is not None: + if dtype not in ['int32', 'int64']: + raise ValueError( + "The value of 'dtype' in argmin op must be int32, int64, but received of {}". + format(dtype)) + var_dtype = convert_np_dtype_to_dtype_(dtype) + attrs["dtype"] = var_dtype + else: + var_dtype = VarDesc.VarType.INT64 + + out = helper.create_variable_for_type_inference(var_dtype) + attrs['keepdims'] = keepdim attrs['axis'] = axis + attrs['flatten'] = flatten helper.append_op( - type='arg_max', - inputs={'X': input}, - outputs={'Out': [out]}, - attrs=attrs) + type='arg_min', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs) out.stop_gradient = True return out @@ -255,8 +327,8 @@ def index_select(x, index, axis=0, name=None): [9.0, 10.0, 11.0, 12.0]]) data_index = np.array([0, 1, 1]).astype('int32') - x = paddle.to_variable(data) - index = paddle.to_variable(data_index) + x = paddle.to_tensor(data) + index = paddle.to_tensor(data_index) out_z1 = paddle.index_select(x=x, index=index) #[[1. 2. 3. 4.] # [5. 6. 7. 8.] @@ -629,3 +701,154 @@ def index_sample(x, index): 'Index': index}, outputs={'Out': out}) return out + + +def masked_select(x, mask, name=None): + """ + This OP Returns a new 1-D tensor which indexes the input tensor according to the ``mask`` + which is a tensor with data type of bool. + + Args: + x (Tensor): The input Tensor, the data type can be int32, int64, float32, float64. + mask (Tensor): The Tensor containing the binary mask to index with, it's data type is bool. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + + Returns: A 1-D Tensor which is the same data type as ``x``. + + Raises: + TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. + TypeError: ``mask`` must be a Tensor and the data type of ``mask`` must be bool. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + data = np.array([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]).astype('float32') + + mask_data = np.array([[True, False, False, False], + [True, True, False, False], + [True, False, False, False]]).astype('bool') + x = paddle.to_tensor(data) + mask = paddle.to_tensor(mask_data) + out = paddle.masked_select(x, mask) + #[1.0 5.0 6.0 9.0] + """ + + if in_dygraph_mode(): + return core.ops.masked_select(x, mask) + + helper = LayerHelper("masked_select", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], + 'paddle.tensor.search.mask_select') + check_variable_and_dtype(mask, 'mask', ['bool'], + 'paddle.tensor.search.masked_select') + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='masked_select', inputs={'X': x, + 'Mask': mask}, outputs={'Y': out}) + return out + + +def topk(x, k, axis=None, largest=True, sorted=True, name=None): + """ + This OP is used to find values and indices of the k largest or smallest at the optional axis. + If the input is a 1-D Tensor, finds the k largest or smallest values and indices. + If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`. + + Args: + x(Tensor): Tensor, an input N-D Tensor with type float32, float64, int32, int64. + k(int, Tensor): The number of top elements to look for along the axis. + axis(int, optional): Axis to compute indices along. The effective range + is [-R, R), where R is x.ndim. when axis < 0, it works the same way + as axis + R. Default is -1. + largest(bool, optional) : largest is a flag, if set to true, + algorithm will sort by descending order, otherwise sort by + ascending order. Default is True. + sorted(bool, optional): controls whether to return the elements in sorted order, default value is True. In gpu device, it always return the sorted value. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + + data_1 = np.array([1, 4, 5, 7]) + tensor_1 = paddle.to_tensor(data_1) + value_1, indices_1 = paddle.topk(tensor_1, k=1) + print(value_1.numpy()) + # [7] + print(indices_1.numpy()) + # [3] + data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]]) + tensor_2 = paddle.to_tensor(data_2) + value_2, indices_2 = paddle.topk(tensor_2, k=1) + print(value_2.numpy()) + # [[7] + # [6]] + print(indices_2.numpy()) + # [[3] + # [1]] + value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1) + print(value_3.numpy()) + # [[7] + # [6]] + print(indices_3.numpy()) + # [[3] + # [1]] + value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0) + print(value_4.numpy()) + # [[2 6 5 7]] + print(indices_4.numpy()) + # [[1 1 0 0]] + + """ + if in_dygraph_mode(): + k = k.numpy().item(0) if isinstance(k, Variable) else k + if axis is None: + out, indices = core.ops.top_k_v2(x, 'k', + int(k), 'largest', largest, + 'sorted', sorted) + else: + out, indices = core.ops.top_k_v2(x, 'k', + int(k), 'axis', axis, 'largest', + largest, 'sorted', sorted) + return out, indices + + helper = LayerHelper("top_k_v2", **locals()) + inputs = {"X": [x]} + attrs = {} + if isinstance(k, Variable): + inputs['K'] = [k] + else: + attrs = {'k': k} + attrs['largest'] = largest + attrs['sorted'] = sorted + if axis is not None: + attrs['axis'] = axis + + values = helper.create_variable_for_type_inference(dtype=x.dtype) + indices = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="top_k_v2", + inputs=inputs, + outputs={"Out": [values], + "Indices": [indices]}, + attrs=attrs) + indices.stop_gradient = True + return values, indices diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 7d22a0be5b0a9a2088f22535c6e2e56f7dc1f959..91676a6316b81a1998b9b48fb9ea7fcba6d67c25 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -15,9 +15,10 @@ # TODO: define statistical functions of a tensor from ..fluid.layers import reduce_mean #DEFINE_ALIAS -__all__ = ['mean', 'reduce_mean', 'std', 'var'] +__all__ = ['mean', 'reduce_mean', 'std', 'var', 'numel'] import numpy as np +from ..fluid.framework import Variable from ..fluid.layer_helper import LayerHelper from ..fluid.framework import core, in_dygraph_mode from ..fluid import layers @@ -31,8 +32,7 @@ def mean(x, axis=None, keepdim=False, name=None): Computes the mean of the input tensor's elements along ``axis``. Args: - x (Tensor): The input Tensor with data type float32, float64, int32, - int64. + x (Tensor): The input Tensor with data type float32, float64. axis (int|list|tuple, optional): The axis along which to perform mean calculations. ``axis`` should be int, list(int) or tuple(int). If ``axis`` is a list/tuple of dimension(s), mean is calculated along @@ -40,9 +40,9 @@ def mean(x, axis=None, keepdim=False, name=None): should be in range [-D, D), where D is the dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` . If ``axis`` is None, mean is - calculated along all elements of ``x``. Default is None. + calculated over all elements of ``x``. Default is None. keepdim (bool, optional): Whether to reserve the reduced dimension(s) - in the output Tensor. If ``keep_dim`` is True, the dimensions of + in the output Tensor. If ``keepdim`` is True, the dimensions of the output Tensor is the same as ``x`` except in the reduced dimensions(it is of size 1 in this case). 
Otherwise, the shape of the output Tensor is squeezed in ``axis`` . Default is False. @@ -67,7 +67,7 @@ def mean(x, axis=None, keepdim=False, name=None): [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]], 'float32') - x = paddle.to_variable(x) + x = paddle.to_tensor(x) out1 = paddle.mean(x) # [12.5] out2 = paddle.mean(x, axis=-1) @@ -96,9 +96,12 @@ def mean(x, axis=None, keepdim=False, name=None): return core.ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) - check_variable_and_dtype(x, 'x/input', - ['float32', 'float64', 'int32', 'int64'], + check_variable_and_dtype(x, 'x/input', ['float32', 'float64'], 'mean/reduce_mean') + check_type(axis, 'axis/dim', (int, list, tuple), 'mean/reduce_mean') + if isinstance(axis, (list, tuple)): + for item in axis: + check_type(item, 'elements of axis/dim', (int), 'mean/reduce_mean') helper = LayerHelper('mean', **locals()) attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} @@ -108,139 +111,155 @@ def mean(x, axis=None, keepdim=False, name=None): return out -def var(input, axis=None, keepdim=False, unbiased=True, out=None, name=None): +def var(x, axis=None, unbiased=True, keepdim=False, name=None): """ - :alias_main: paddle.var - :alias: paddle.var,paddle.tensor.var,paddle.tensor.stat.var - - Computes the variance of the input Variable's elements along the specified - axis. + Computes the variance of ``x`` along ``axis`` . Args: - input (Variable): The input Variable to be computed variance, with data - type float32 and float64 supported. - axis (list|int, optional): The axis along which the variance is computed. - If `None`, compute the variance over all elements of :attr:`input` - and return a Variable with a single element, otherwise it must be in - the range :math:`[-rank(input), rank(input))`. If :math:`axis[i] < 0`, - the axis to compute is :math:`rank(input) + axis[i]`. - keepdim (bool, optional): Whether to reserve the reduced dimensions in - the output Variable. The dimensions in :attr:`axis` will be squeezed - and the result Variable will have :attr:`len(axis)` fewer dimensions - than the :attr:`input` unless :attr:`keepdim` is true, default False. - unbiased (bool, optional): Whether to compute variance via the unbiased - estimator, in which the divisor used in the computation is - :math:`N - 1`, where :math:`N` represents the number of elements - along :attr:`axis`, otherwise the divisor is :math:`N`. Default True. - out (Variable, optional): Alternate output Variable to store the result - variance. Default None. - name (str, optional): The name for this layer. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. Default None. + x (Tensor): The input Tensor with data type float32, float64. + axis (int|list|tuple, optional): The axis along which to perform + variance calculations. ``axis`` should be int, list(int) or + tuple(int). If ``axis`` is a list/tuple of dimension(s), variance + is calculated along all element(s) of ``axis`` . ``axis`` or + element(s) of ``axis`` should be in range [-D, D), where D is the + dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is less + than 0, it works the same way as :math:`axis + D` . If ``axis`` is + None, variance is calculated over all elements of ``x``. Default + is None. + unbiased (bool, optional): Whether to use the unbiased estimation. 
If + ``unbiased`` is True, the divisor used in the computation is + :math:`N - 1`, where :math:`N` represents the number of elements + along ``axis`` , otherwise the divisor is :math:`N`. Default is True. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: The result variance with the same dtype as :attr:`input`. - If :attr:`out = None`, returns a new Variable containing the - variance, otherwise returns a reference to the output Variable. + Tensor, results of variance along ``axis`` of ``x``, with the same data + type as ``x``. Examples: .. code-block:: python - import numpy as np import paddle - import paddle.fluid.dygraph as dg - - a = np.array([[1.0, 2.0], [3.0, 4.0]]).astype("float32") - with dg.guard(): - data = dg.to_variable(a) - variance = paddle.var(data, axis=[1]) - print(variance.numpy()) - # [0.5 0.5] + import numpy as np + + paddle.disable_static() + + x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]]) + x = paddle.to_tensor(x) + out1 = paddle.var(x) + # [2.66666667] + out2 = paddle.var(x, axis=1) + # [1. 4.33333333] """ - dtype = convert_dtype(input.dtype) - if dtype not in ["float32", "float64"]: - raise ValueError("Layer tensor.var() only supports floating-point " - "dtypes, but received {}.".format(dtype)) - rank = len(input.shape) - axes = axis if axis != None and axis != [] else range(rank) - axes = [e if e >= 0 else e + rank for e in axes] - inp_shape = input.shape if in_dygraph_mode() else layers.shape(input) - mean = layers.reduce_mean(input, dim=axis, keep_dim=True, name=name) - tmp = layers.reduce_mean( - (input - mean)**2, dim=axis, keep_dim=keepdim, name=name) + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'var') + u = mean(x, axis, True, name) + out = paddle.sum((x - u)**2, axis, keepdim=keepdim, name=name) + + n = paddle.cast(paddle.numel(x), x.dtype) \ + / paddle.cast(paddle.numel(out), x.dtype) if unbiased: - n = 1 - for i in axes: - n *= inp_shape[i] - if not in_dygraph_mode(): - n = layers.cast(n, dtype) - zero_const = layers.fill_constant(shape=[1], dtype=dtype, value=0.0) - factor = where(n > 1.0, n / (n - 1.0), zero_const) - else: - factor = n / (n - 1.0) if n > 1.0 else 0.0 - tmp *= factor - if out: - layers.assign(input=tmp, output=out) - return out - else: - return tmp - - -def std(input, axis=None, keepdim=False, unbiased=True, out=None, name=None): + one_const = paddle.ones([1], x.dtype) + n = where(n > one_const, n - 1., one_const) + out /= n + return out + + +def std(x, axis=None, unbiased=True, keepdim=False, name=None): + """ + Computes the standard-deviation of ``x`` along ``axis`` . + + Args: + x (Tensor): The input Tensor with data type float32, float64. + axis (int|list|tuple, optional): The axis along which to perform + standard-deviation calculations. ``axis`` should be int, list(int) + or tuple(int). If ``axis`` is a list/tuple of dimension(s), + standard-deviation is calculated along all element(s) of ``axis`` . + ``axis`` or element(s) of ``axis`` should be in range [-D, D), + where D is the dimensions of ``x`` . 
If ``axis`` or element(s) of + ``axis`` is less than 0, it works the same way as :math:`axis + D` . + If ``axis`` is None, standard-deviation is calculated over all + elements of ``x``. Default is None. + unbiased (bool, optional): Whether to use the unbiased estimation. If + ``unbiased`` is True, the standard-deviation is calculated via the + unbiased estimator. If ``unbiased`` is True, the divisor used in + the computation is :math:`N - 1`, where :math:`N` represents the + number of elements along ``axis`` , otherwise the divisor is + :math:`N`. Default is True. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, results of standard-deviation along ``axis`` of ``x``, with the + same data type as ``x``. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + + x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]]) + x = paddle.to_tensor(x) + out1 = paddle.std(x) + # [1.63299316] + out2 = paddle.std(x, axis=1) + # [1. 2.081666] """ - :alias_main: paddle.std - :alias: paddle.std,paddle.tensor.std,paddle.tensor.stat.std + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'std') - Computes the standard-deviation of the input Variable's elements along the specified - axis. + out = var(**locals()) + return paddle.sqrt(out) + + +def numel(x, name=None): + """ + Returns the number of elements for a tensor, which is a int64 Tensor with shape [1] in static mode + or a scalar value in imperative mode Args: - input (Variable): The input Variable to be computed standard-deviation, with data - type float32 and float64 supported. - axis (list|int, optional): The axis along which the standard-deviation is computed. - If `None`, compute the standard-deviation over all elements of :attr:`input` - and return a Variable with a single element, otherwise it must be in - the range :math:`[-rank(input), rank(input))`. If :math:`axis[i] < 0`, - the axis to compute is :math:`rank(input) + axis[i]`. - keepdim (bool, optional): Whether to reserve the reduced dimensions in - the output Variable. The dimensions in :attr:`axis` will be squeezed - and the result Variable will have :attr:`len(axis)` fewer dimensions - than the :attr:`input` unless :attr:`keepdim` is true, default False. - unbiased (bool, optional): Whether to compute standard-deviation via the unbiased - estimator, in which the divisor used in the computation is - :math:`N - 1`, where :math:`N` represents the number of elements - along :attr:`axis`, otherwise the divisor is :math:`N`. Default True. - out (Variable, optional): Alternate output Variable to store the result - standard-deviation . Default None. - name (str, optional): The name for this layer. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. Default None. + x (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64. Returns: - Variable: The result standard-deviation with the same dtype as :attr:`input`. 
- If :attr:`out = None`, returns a new Variable containing the - standard-deviation , otherwise returns a reference to the output Variable. + Tensor: The number of elements for the input Tensor. + + Raises: + TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. + + Examples: .. code-block:: python import paddle - import paddle.fluid as fluid - # x is a Tensor variable with following elements: - # [[0.2, 0.3, 0.5, 0.9] - # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the corresponding output tensor. - x = fluid.data(name='x', shape=[2, 4], dtype='float32') - paddle.std(x) # [0.28252685] - paddle.std(x, axis=[0]) # [0.0707107, 0.07071075, 0.07071064, 0.1414217] - paddle.std(x, axis=[-1]) # [0.30956957, 0.29439208] + + paddle.disable_static() + x = paddle.full(shape=[4, 5, 7], fill_value=0, dtype='int32') + numel = paddle.numel(x) # 140 + + """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'std') - - tmp = var(input, axis=axis, keepdim=keepdim, unbiased=unbiased, name=name) - tmp = layers.sqrt(tmp) - if out is not None: - layers.assign(input=tmp, output=out) - return out - else: - return tmp + if in_dygraph_mode(): + return core.ops.size(x) + + if not isinstance(x, Variable): + raise TypeError("x must be a Tensor in numel") + helper = LayerHelper('numel', **locals()) + out = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64) + helper.append_op(type='size', inputs={'Input': x}, outputs={'Out': out}) + return out diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4 --- /dev/null +++ b/python/paddle/tests/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/tests/test_metrics.py b/python/paddle/tests/test_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..2272a81b3f602ec46972c9d4620ded9680e2ff5f --- /dev/null +++ b/python/paddle/tests/test_metrics.py @@ -0,0 +1,275 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
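For reference, the variance and standard-deviation values quoted in the new `var`/`std` docstrings follow directly from the unbiased estimator (divide by N - 1), with N obtained the same way the rewritten implementation derives it, i.e. from `numel`. A NumPy cross-check of those numbers (the cross-check itself is illustrative; the expected values come from the docstrings above):

    import numpy as np

    x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]])

    # paddle.var(x): all 6 elements, unbiased divisor 6 - 1 = 5
    print(((x - x.mean()) ** 2).sum() / (x.size - 1))                    # 2.6666...
    # paddle.var(x, axis=1): per row, divisor 3 - 1 = 2
    print(((x - x.mean(axis=1, keepdims=True)) ** 2).sum(axis=1) / 2.0)  # [1.  4.3333...]
    # paddle.std is sqrt(var); paddle.numel(x) reports the 6 elements used above
    print(np.sqrt(((x - x.mean()) ** 2).sum() / (x.size - 1)))           # 1.63299...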
+ +from __future__ import division +from __future__ import print_function + +import os +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid + +from paddle.incubate.hapi.utils import to_list + + +def accuracy(pred, label, topk=(1, )): + maxk = max(topk) + pred = np.argsort(pred)[:, ::-1][:, :maxk] + correct = (pred == np.repeat(label, maxk, 1)) + + batch_size = label.shape[0] + res = [] + for k in topk: + correct_k = correct[:, :k].sum() + res.append(float(correct_k) / batch_size) + return res + + +def convert_to_one_hot(y, C): + oh = np.random.choice(np.arange(C), C, replace=False).astype('float32') / C + oh = np.tile(oh[np.newaxis, :], (y.shape[0], 1)) + for i in range(y.shape[0]): + oh[i, int(y[i])] = 1. + return oh + + +class TestAccuracy(unittest.TestCase): + def test_acc(self): + paddle.disable_static() + + x = paddle.to_tensor( + np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2], + [0.1, 0.2, 0.4, 0.3], [0.1, 0.2, 0.3, 0.4]])) + y = paddle.to_tensor(np.array([[0], [1], [2], [3]])) + + m = paddle.metric.Accuracy(name='my_acc') + + # check name + self.assertEqual(m.name(), ['my_acc']) + + correct = m.compute(x, y) + # check results + self.assertEqual(m.update(correct), 0.75) + self.assertEqual(m.accumulate(), 0.75) + + x = paddle.to_tensor( + np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.3, 0.4, 0.2], + [0.1, 0.2, 0.4, 0.3], [0.1, 0.2, 0.3, 0.4]])) + y = paddle.to_tensor(np.array([[0], [1], [2], [3]])) + correct = m.compute(x, y) + # check results + self.assertEqual(m.update(correct), 0.5) + self.assertEqual(m.accumulate(), 0.625) + + # check reset + m.reset() + self.assertEqual(m.total[0], 0.0) + self.assertEqual(m.count[0], 0.0) + paddle.enable_static() + + +class TestAccuracyDynamic(unittest.TestCase): + def setUp(self): + self.topk = (1, ) + self.class_num = 5 + self.sample_num = 1000 + self.name = None + + def random_pred_label(self): + label = np.random.randint(0, self.class_num, + (self.sample_num, 1)).astype('int64') + pred = np.random.randint(0, self.class_num, + (self.sample_num, 1)).astype('int32') + pred_one_hot = convert_to_one_hot(pred, self.class_num) + pred_one_hot = pred_one_hot.astype('float32') + + return label, pred_one_hot + + def test_main(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + acc = paddle.metric.Accuracy(topk=self.topk, name=self.name) + for _ in range(10): + label, pred = self.random_pred_label() + label_var = paddle.to_tensor(label) + pred_var = paddle.to_tensor(pred) + state = to_list(acc.compute(pred_var, label_var)) + acc.update(* [s.numpy() for s in state]) + res_m = acc.accumulate() + res_f = accuracy(pred, label, self.topk) + assert np.all(np.isclose(np.array(res_m, dtype='float64'), + np.array(res_f, dtype='float64'), rtol=1e-3)), \ + "Accuracy precision error: {} != {}".format(res_m, res_f) + acc.reset() + assert np.sum(acc.total) == 0 + assert np.sum(acc.count) == 0 + + +class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic): + def setUp(self): + self.topk = (1, 5) + self.class_num = 10 + self.sample_num = 1000 + self.name = "accuracy" + + +class TestAccuracyStatic(TestAccuracyDynamic): + def test_main(self): + main_prog = fluid.Program() + startup_prog = fluid.Program() + main_prog.random_seed = 1024 + startup_prog.random_seed = 1024 + with fluid.program_guard(main_prog, startup_prog): + pred = fluid.data( + name='pred', shape=[None, self.class_num], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + acc = paddle.metric.Accuracy(topk=self.topk, name=self.name) + state = 
acc.compute(pred, label) + + exe = fluid.Executor(fluid.CPUPlace()) + compiled_main_prog = fluid.CompiledProgram(main_prog) + + for _ in range(10): + label, pred = self.random_pred_label() + state_ret = exe.run(compiled_main_prog, + feed={'pred': pred, + 'label': label}, + fetch_list=[s.name for s in to_list(state)], + return_numpy=True) + acc.update(*state_ret) + res_m = acc.accumulate() + res_f = accuracy(pred, label, self.topk) + assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \ + "Accuracy precision error: {} != {}".format(res_m, res_f) + acc.reset() + assert np.sum(acc.total) == 0 + assert np.sum(acc.count) == 0 + + +class TestAccuracyStaticMultiTopk(TestAccuracyStatic): + def setUp(self): + self.topk = (1, 5) + self.class_num = 10 + self.sample_num = 100 + self.name = "accuracy" + + +class TestPrecision(unittest.TestCase): + def test_1d(self): + paddle.disable_static() + + x = np.array([0.1, 0.5, 0.6, 0.7]) + y = np.array([1, 0, 1, 1]) + + m = paddle.metric.Precision() + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 2. / 3.) + + x = paddle.to_tensor(np.array([0.1, 0.5, 0.6, 0.7, 0.2])) + y = paddle.to_tensor(np.array([1, 0, 1, 1, 1])) + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 4. / 6.) + + paddle.enable_static() + + def test_2d(self): + paddle.disable_static() + + x = np.array([0.1, 0.5, 0.6, 0.7]).reshape(-1, 1) + y = np.array([1, 0, 1, 1]).reshape(-1, 1) + + m = paddle.metric.Precision() + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 2. / 3.) + + x = np.array([0.1, 0.5, 0.6, 0.7, 0.2]).reshape(-1, 1) + y = np.array([1, 0, 1, 1, 1]).reshape(-1, 1) + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 4. / 6.) + + # check reset + m.reset() + self.assertEqual(m.tp, 0.0) + self.assertEqual(m.fp, 0.0) + self.assertEqual(m.accumulate(), 0.0) + + paddle.enable_static() + + +class TestRecall(unittest.TestCase): + def test_1d(self): + paddle.disable_static() + + x = np.array([0.1, 0.5, 0.6, 0.7]) + y = np.array([1, 0, 1, 1]) + + m = paddle.metric.Recall() + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 2. / 3.) + + x = paddle.to_tensor(np.array([0.1, 0.5, 0.6, 0.7])) + y = paddle.to_tensor(np.array([1, 0, 0, 1])) + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 3. / 5.) 
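+        # Recall is streaming: tp and fn accumulate across update() calls.
+        # The first update gave tp=2, fn=1 (2/3); this one adds tp=1, fn=1,
+        # so the running recall is (2 + 1) / ((2 + 1) + (1 + 1)) = 3/5.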
+ + # check reset + m.reset() + self.assertEqual(m.tp, 0.0) + self.assertEqual(m.fn, 0.0) + self.assertEqual(m.accumulate(), 0.0) + paddle.enable_static() + + +class TestAuc(unittest.TestCase): + def test_auc_numpy(self): + paddle.disable_static() + x = np.array([[0.78, 0.22], [0.62, 0.38], [0.55, 0.45], [0.30, 0.70], + [0.14, 0.86], [0.59, 0.41], [0.91, 0.08], [0.16, 0.84]]) + y = np.array([[0], [1], [1], [0], [1], [0], [0], [1]]) + m = paddle.metric.Auc() + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 0.8125) + + m.reset() + self.assertEqual(m.accumulate(), 0.0) + + paddle.enable_static() + + def test_auc_tensor(self): + paddle.disable_static() + x = paddle.to_tensor( + np.array([[0.78, 0.22], [0.62, 0.38], [0.55, 0.45], [0.30, 0.70], + [0.14, 0.86], [0.59, 0.41], [0.91, 0.08], [0.16, 0.84]])) + y = paddle.to_tensor(np.array([[0], [1], [1], [0], [1], [0], [0], [1]])) + m = paddle.metric.Auc() + m.update(x, y) + r = m.accumulate() + self.assertAlmostEqual(r, 0.8125) + + m.reset() + self.assertEqual(m.accumulate(), 0.0) + + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py index ed0d84d729f0deb12d62095b3a36bac88c6c7be1..08fd7e33479b331454f63f05f6240dd221591ee9 100644 --- a/python/paddle/utils/deprecated.py +++ b/python/paddle/utils/deprecated.py @@ -44,6 +44,8 @@ def deprecated(update_to="", since="", reason=""): """ def decorator(func): + # TODO(zhiqiu): We temporally disable the warnings for 2.0-bata, and it should be re-enabled in the future. + return func """construct warning message, and return a decorated function or class.""" assert isinstance(update_to, str), 'type of "update_to" must be str.' assert isinstance(since, str), 'type of "since" must be str.' 
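Taken together, these tests exercise the streaming-metric pattern shared by the new `paddle.metric` classes: `Accuracy` is fed per-batch correctness produced by `compute()`, while `Precision`, `Recall` and `Auc` consume raw predictions and labels; all of them fold state into `update()` and report it via `accumulate()` until `reset()`. A minimal usage sketch based on the calls made in the tests above (the input values here are illustrative, not taken from the test data):

    import numpy as np
    import paddle

    paddle.disable_static()

    # Accuracy: compute() scores a batch, update() folds it into the running total.
    acc = paddle.metric.Accuracy()
    pred = paddle.to_tensor(np.array([[0.1, 0.9], [0.8, 0.2]]))
    label = paddle.to_tensor(np.array([[1], [0]]))
    acc.update(acc.compute(pred, label))
    print(acc.accumulate())   # 1.0 -- both samples are hit at top-1

    # Precision/Recall/Auc take predictions and labels directly.
    auc = paddle.metric.Auc()
    auc.update(np.array([[0.3, 0.7], [0.6, 0.4]]), np.array([[1], [0]]))
    print(auc.accumulate())   # 1.0 -- the positive sample scores higher than the negative one
    auc.reset()

    paddle.enable_static()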
diff --git a/python/requirements.txt b/python/requirements.txt index 76e6b5dedbeaf8acf726b634e3579f980ff9702f..5e81ec680897024e7c32d193bef1716e9b25b4a4 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,3 +1,4 @@ +opencv-python<=4.2.0.32 requests>=2.20.0 numpy>=1.13, <=1.16.4 ; python_version<"3.5" numpy>=1.13 ; python_version>="3.5" @@ -22,3 +23,4 @@ objgraph astor pathlib netifaces +psutil diff --git a/python/setup.py.in b/python/setup.py.in index 29bc68444e1e6348218a3ada3233333c2b9f1cde..5b206296bd641bf909115d1c580518afe85a37b6 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -201,6 +201,7 @@ packages=['paddle', 'paddle.nn.functional', 'paddle.nn.layer', 'paddle.nn.initializer', + 'paddle.nn.utils', 'paddle.metric', 'paddle.static', 'paddle.static.nn', @@ -302,6 +303,23 @@ if '${WITH_MKLDNN}' == 'ON': else: package_data['paddle.libs']+=['mkldnn.dll'] +if '${WITH_XPU}' == 'ON': + # only change rpath in Release mode, + if '${CMAKE_BUILD_TYPE}' == 'Release': + if os.name != 'nt': + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}" + else: + command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}" + if os.system(command) != 0: + raise Exception("patch ${XPU_API_LIB} failed, command: %s" % command) + shutil.copy('${XPU_API_LIB}', libs_path) + shutil.copy('${XPU_RT_LIB}', libs_path) + shutil.copy('${XPU_SIM_LIB}', libs_path) + package_data['paddle.libs']+=['${XPU_API_LIB_NAME}', + '${XPU_RT_LIB_NAME}', + '${XPU_SIM_LIB_NAME}'] + # copy libfuild_framework.so to libs if os.name != 'nt' and sys.platform != 'darwin': paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}' diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index 049621b9388997deaeab618c09c579858a60d47e..b10e76a4b4d037bfa0d72e74e660cf696f5ee1d3 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -63,12 +63,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 
'ipython==5.3.0' opencv-python==4.2.0.32 && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index aa547fe74163fdbe08c1266817b7f1903e541ad5..9fe58885fa553671cf5c08bd51295f271f4df668 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -156,19 +156,19 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 --no-cache-dir install opencv-python && \ + pip3 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.8 --no-cache-dir install opencv-python && \ + pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip --no-cache-dir install opencv-python + pip --no-cache-dir install opencv-python==4.2.0.32 #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh index 6f201a8579fea29ec6eaabf1faca77da26b11882..9f937cf9343784f10d186dd5bdcbace6f8a4e0e9 100755 --- a/tools/dockerfile/build_scripts/build_utils.sh +++ b/tools/dockerfile/build_scripts/build_utils.sh @@ -89,7 +89,7 @@ function do_cpython_build { fi # NOTE Make libpython shared library visible to python calls below LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py - LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2 cd / ls ${MY_DIR} local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py) diff --git a/tools/gen_alias_mapping.sh b/tools/gen_alias_mapping.sh new file mode 100755 index 0000000000000000000000000000000000000000..3ab1e68b37557404197cf552cbd0a4def08e9c41 --- /dev/null +++ b/tools/gen_alias_mapping.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Brief: +# This code is used for generating the mapping list of Paddle API alias. +# Only the APIs set with the `DEFINE_ALIAS` flag is enable. +# +# Arguments: +# None +# +# Usage: +# Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh` +# +# Returns: +# succ: 0 +# +# Will also print the mapping list to stdout. The format of each line is as below: +# \t,,,... + + +PADDLE_ROOT="$(dirname $(readlink -f ${BASH_SOURCE[0]}))/.." + +find ${PADDLE_ROOT}/python/ -name '*.py' \ + | xargs grep -v '^#' \ + | grep 'DEFINE_ALIAS' \ + | perl -ne ' + if (/\/python\/(.*):from (\.*)(\w.*) import (.*?)\s+#DEFINE_ALIAS\s+$/) { + my @arr = split(", ", $4); + foreach $i (@arr) { + printf "%s|%s|%s|%d\n", $3, $i, substr($1, 0, -3), length($2); + } + }' \ + | awk -F '[|/]' ' + { + key = ""; + val = ""; + if ($2 ~ /.* as .*/) { + split($2, arr, " as "); + old = arr[1]; + new = arr[2]; + } else { + old = $2; + new = $2; + } + for (i = 3; i <= (NF - 1 - $NF); ++i) { + val = val""$i"."; + } + val = val""$1"."old + for (i = 3; i <= (NF - 1); ++i) { + if ($i != "__init__") { + key = key""$i"."; + } + } + key = key""new; + n2o[key] = val; + } + END { + for (new in n2o) { + old = n2o[new] in n2o ? n2o[n2o[new]] : n2o[new]; + print old, length(new), new; + } + }' \ + | sort -k 1,1 -k 2n,2 \ + | awk ' + { + o2n[$1] = o2n[$1] ? o2n[$1]","$3 : $3; + } + END { + for (i in o2n) { + print i"\t"o2n[i]; + } + }' diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 index ffef02dba4614f7bbbe13ebc30b40438a52b4590..e3a3374b943bc955d54afbef9755ed5147fad7d2 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -199,12 +198,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . 
\ - make) # ar mishandles 4GB files # https://sourceware.org/bugzilla/show_bug.cgi?id=14625 diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index 837f0e486f6112bfc645c55ded8dfd0726d414d6..c27fdcea2401c26b1ef1dd377c42930b6e74fcf0 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -212,12 +211,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . \ - make) # ar mishandles 4GB files # https://sourceware.org/bugzilla/show_bug.cgi?id=14625 diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 102b50c43aeabc6ab2c67840edfaf42615cf51f5..033b4b8723aa30465cdb07198f470d7c09a0f326 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -480,14 +480,8 @@ def get_filenames(): filename = '' print("\nWARNING:----Exception in get api filename----\n") print("\n" + api + ' module is ' + module + "\n") - if filename != '': - # rm contrib file - if filename.startswith( - '../python/paddle/fluid/contrib' - ) or filename == '../python/paddle/verison.py': - pass - elif filename not in filenames: - filenames.append(filename) + if filename != '' and filename not in filenames: + filenames.append(filename) # get all methods method = '' if inspect.isclass(eval(api)): @@ -557,14 +551,18 @@ def get_wlist(): ''' wlist = [] + wlist_file = [] with open("wlist.json", 'r') as load_f: load_dict = json.load(load_f) for key in load_dict: - wlist = wlist + load_dict[key] - return wlist + if key == 'wlist_file': + wlist_file = wlist_file + load_dict[key] + else: + wlist = wlist + load_dict[key] + return wlist, wlist_file -wlist = get_wlist() +wlist, wlist_file = get_wlist() if len(sys.argv) < 2: print("Error: inadequate number of arguments") @@ -590,8 +588,14 @@ else: if len(filenames) == 0 and len(whl_error) == 0: print("-----API_PR.spec is the same as API_DEV.spec-----") exit(0) - elif '../python/paddle/fluid/core_avx.py' in filenames: - filenames.remove('../python/paddle/fluid/core_avx.py') + rm_file = [] + for f in filenames: + for w_file in wlist_file: + if f.startswith(w_file): + rm_file.append(f) + filenames.remove(f) + if len(rm_file) != 0: + print("REMOVE white files: %s" % rm_file) print("API_PR is diff from API_DEV: %s" % filenames) one_part_filenum = int(math.ceil(len(filenames) / cpus)) if one_part_filenum == 0: diff --git a/tools/wlist.json b/tools/wlist.json index 6989882504eded7c56851e6e9351cef9b4975137..20f6a9cbaedb391995b3757612ec24f2061a8a81 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -1,4 +1,10 @@ { + "wlist_file" : [ + "../python/paddle/fluid/contrib", + "../python/paddle/verison.py", + "../python/paddle/fluid/core_avx.py", + "../python/paddle/distributed" + ], "wlist_inneed":[ "append_LARS", "BuildStrategy.debug_graphviz_path", @@ -63,7 +69,6 @@ "Compressor", "Compressor.config", "Compressor.run", - "run_check", "HDFSClient.upload", "HDFSClient.download", "HDFSClient.is_exist", @@ -107,12 +112,27 @@ "Metric.update", 
"Metric.accumulate", "Metric.name", - "Metric.add_metric_op", + "Metric.compute", "Accuracy.reset", "Accuracy.update", "Accuracy.accumulate", "Accuracy.name", - "Accuracy.add_metric_op", + "Accuracy.compute", + "Precision.reset", + "Precision.update", + "Precision.accumulate", + "Precision.name", + "Precision.compute", + "Recall.reset", + "Recall.update", + "Recall.accumulate", + "Recall.name", + "Recall.compute", + "Auc.reset", + "Auc.update", + "Auc.accumulate", + "Auc.name", + "Auc.compute", "Callback.set_params", "Callback.on_train_begin", "Callback.on_train_end", @@ -128,7 +148,20 @@ "Callback.on_eval_batch_end", "Callback.on_test_batch_begin", "Callback.on_test_batch_end", - "Model.prepare" + "Model.prepare", + "SimpleRNNCell", + "SimpleRNNCell.forward", + "LSTMCell", + "LSTMCell.forward", + "GRUCell", + "GRUCell.forward", + "SimpleRNN", + "GRU", + "LSTM", + "RNN", + "BiRNN", + "RNNCellBase", + "RNNCellBase.get_initial_states" ], "wlist_no_op_pass":[ "gelu", @@ -214,6 +247,7 @@ "prroi_pool" ], "wlist_temp":[ + "to_tensor", "ChunkEvaluator", "EditDistance", "ErrorClipByValue",