Commit 896ed573, authored by jingqinghe

update

@@ -28,7 +28,10 @@ include(generic) # simplify cmake module
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
+if (WITH_GPU AND WITH_XPU)
+  message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
+endif()
 # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
 if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
   message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
...
@@ -33,7 +33,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 ```
 It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.
...
@@ -30,7 +30,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 ```
 For more installation details, see the official [installation guide](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html).
...
@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
   add_definitions(-DPADDLE_WITH_BOX_PS)
 endif()
+if(WITH_XPU)
+  message(STATUS "Compile with XPU!")
+  add_definitions(-DPADDLE_WITH_XPU)
+endif()
 if(WITH_GPU)
   add_definitions(-DPADDLE_WITH_CUDA)
   add_definitions(-DEIGEN_USE_GPU)
...
@@ -61,6 +61,10 @@ function(detect_installed_gpus out_variable)
     if(NOT CUDA_gpu_detect_output)
       message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
       set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+      #Todo: fix Automatic GPU detection failed on windows
+      if(WIN32)
+        set(${out_variable} "61 75" PARENT_SCOPE)
+      endif()
     else()
       set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
     endif()
...
@@ -17,7 +17,7 @@ include(ExternalProject)
 set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub)
 set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub)
 set(CUB_REPOSITORY https://github.com/NVlabs/cub.git)
-set(CUB_TAG 1.9.8)
+set(CUB_TAG 1.8.0)
 cache_third_party(extern_cub
     REPOSITORY    ${CUB_REPOSITORY}
...
@@ -14,13 +14,21 @@
 INCLUDE(ExternalProject)
+execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
 SET(GLOO_PROJECT "extern_gloo")
 IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(GLOO_VER "master" CACHE STRING "" FORCE)
   SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-  SET(GLOO_URL "https://pslib.bj.bcebos.com/gloo.tar.gz" CACHE STRING "" FORCE)
+  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
+  else()
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
+  endif()
 ENDIF()
 MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
 SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
 SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
...
if (NOT WITH_XPU)
return()
endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include")
SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_SIM_LIB_NAME "libxpusim.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# generate a static dummy target to track xpulib dependencies
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
@@ -384,8 +384,8 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    # No unit test should exceed 2 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
   endif()
 endfunction()
@@ -743,8 +743,8 @@ function(py_test TARGET_NAME)
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    # No unit test should exceed 2 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
   endif()
 endfunction()
...
@@ -8,6 +8,7 @@ function(op_library TARGET)
   set(hip_cu_srcs)
   set(miopen_hip_cc_srcs)
   set(cu_cc_srcs)
+  set(xpu_cc_srcs)
   set(cudnn_cu_cc_srcs)
   set(cudnn_cu_srcs)
   set(CUDNN_FILE)
@@ -60,6 +61,12 @@ function(op_library TARGET)
         list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
       endif()
     endif()
+    if(WITH_XPU)
+      string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
+        list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+      endif()
+    endif()
   else()
     foreach(src ${op_library_SRCS})
       if (${src} MATCHES ".*\\.hip.cu$")
@@ -76,6 +83,8 @@ function(op_library TARGET)
         list(APPEND mkldnn_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cu.cc$")
         list(APPEND cu_cc_srcs ${src})
+      elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+        list(APPEND xpu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cc$")
         list(APPEND cc_srcs ${src})
       else()
@@ -109,7 +118,7 @@ function(op_library TARGET)
     hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   else()
-    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   endif()
@@ -150,10 +159,11 @@ function(op_library TARGET)
   list(LENGTH cu_srcs cu_srcs_len)
   list(LENGTH cu_cc_srcs cu_cc_srcs_len)
   list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+  list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
   list(LENGTH hip_cu_srcs hip_cu_srcs_len)
   list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
   if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-    ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
+    ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
     file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
     set(pybind_flag 1)
   endif()
@@ -179,6 +189,9 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
   endif()
+  if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+  endif()
   # pybind USE_OP_DEVICE_KERNEL for MKLDNN
   if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
     # Append first implemented MKLDNN activation operator
@@ -228,6 +241,7 @@ function(register_operators)
   file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
   string(REPLACE "_mkldnn" "" OPS "${OPS}")
+  string(REPLACE "_xpu" "" OPS "${OPS}")
   string(REPLACE ".cc" "" OPS "${OPS}")
   list(REMOVE_DUPLICATES OPS)
   list(LENGTH register_operators_DEPS register_operators_DEPS_len)
...
@@ -250,6 +250,11 @@ if(WITH_GPU)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
+if(WITH_XPU)
+    include(external/xpu)  # download, build, install xpu
+    list(APPEND third_party_deps extern_xpu)
+endif(WITH_XPU)
 if(WITH_PSLIB)
     include(external/pslib)  # download, build, install pslib
     list(APPEND third_party_deps extern_pslib)
@@ -263,10 +268,6 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
-if(NOT WIN32 AND NOT APPLE)
-    include(external/gloo)
-    list(APPEND third_party_deps extern_gloo)
-endif()
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -274,6 +275,11 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 if(WITH_DISTRIBUTE)
+    if(WITH_GLOO)
+        include(external/gloo)
+        list(APPEND third_party_deps extern_gloo)
+    endif()
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
     else()
...
@@ -122,6 +122,10 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
 cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
+cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost)
+cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context)
@@ -268,6 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib
 cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
+cc_library(generator SRCS generator.cc)
 # Get the current working branch
 execute_process(
...
@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
     return ctx;
   }
+  inline ::DLContext operator()(const platform::XPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::XPUPlace is not supported"));
+  }
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
     ::DLContext ctx;
...
@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
+#ifdef PADDLE_WITH_CUDA
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
@@ -453,14 +453,23 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
         gc.reset(new DefaultStreamGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
       }
-    } else if (platform::is_cpu_place(place_)) {
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
 #endif
+    } else if (platform::is_cpu_place(place_)) {
       gc.reset(new CPUGarbageCollector(
           BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
-#ifdef PADDLE_WITH_CUDA
-    }
+    } else if (platform::is_xpu_place(place_)) {
+#ifdef PADDLE_WITH_XPU
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
 #endif
     }
+  }
   for (int64_t i = start_op_index; i < end_op_index; ++i) {
     auto& op = ctx->ops_[i];
...
@@ -19,6 +19,6 @@ else()
   cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)
-cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
   callback();
 }
+#ifdef PADDLE_WITH_XPU
+XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+#endif
 #ifdef PADDLE_WITH_CUDA
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
     const platform::CUDAPlace &place, size_t max_memory_size)
...
@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
   void ClearCallback(const std::function<void()> &callback) override;
 };
+#ifdef PADDLE_WITH_XPU
+class XPUGarbageCollector : public GarbageCollector {
+ public:
+  XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
+#endif
 #ifdef PADDLE_WITH_CUDA
 class UnsafeFastGPUGarbageCollector : public GarbageCollector {
  public:
...
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <deque>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/generator.h"
namespace paddle {
namespace framework {
std::shared_ptr<Generator> Generator::gen_instance_ = NULL;
GeneratorState* Generator::GetState() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_.get();
}
void Generator::SetState(GeneratorState* state_in) {
std::lock_guard<std::mutex> lock(this->mutex);
*this->state_ = *state_in;
}
uint64_t Generator::GetCurrentSeed() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->current_seed;
}
uint64_t Generator::Seed() {
std::lock_guard<std::mutex> lock(this->mutex);
uint64_t seed;
std::random_device de;
seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
this->state_->current_seed = seed;
std::seed_seq seq({seed});
this->state_->cpu_engine.seed(seq);
return this->state_->current_seed;
}
void Generator::SetCurrentSeed(uint64_t seed) {
std::lock_guard<std::mutex> lock(this->mutex);
this->state_->current_seed = uint64_t(seed);
std::seed_seq seq({seed});
this->state_->cpu_engine.seed(seq);
}
std::mt19937_64& Generator::GetCPUEngine() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->cpu_engine;
}
void Generator::SetCPUEngine(std::mt19937_64 engine) {
std::lock_guard<std::mutex> lock(this->mutex);
this->state_->cpu_engine = std::mt19937_64(engine);
}
uint64_t Generator::Random64() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->cpu_engine();
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <atomic>
#include <deque>
#include <iostream> // temp for debug
#include <memory>
#include <mutex> // NOLINT
#include <random>
#include <typeinfo>
#include <utility>
namespace paddle {
namespace framework {
struct GeneratorState {
int64_t device = -1;
uint64_t current_seed = 34342423252;
std::mt19937_64 cpu_engine;
};
struct Generator {
Generator() {
GeneratorState default_gen_state_cpu;
default_gen_state_cpu.device = -1;
default_gen_state_cpu.current_seed = 34342423252;
std::seed_seq seq({34342423252});
default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
}
explicit Generator(GeneratorState state_in)
: state_{std::make_shared<GeneratorState>(state_in)} {}
Generator(const Generator& other)
: Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}
// get random state
GeneratorState* GetState();
// set random state
void SetState(GeneratorState* state_in);
// get current seed
uint64_t GetCurrentSeed();
// random a seed and get
uint64_t Seed();
// set seed
void SetCurrentSeed(uint64_t seed);
// get cpu engine
std::mt19937_64& GetCPUEngine();
// set cpu engine
void SetCPUEngine(std::mt19937_64 engine);
uint64_t Random64();
bool is_init_py = false;
// CPU Generator singleton
static std::shared_ptr<Generator> GetInstance() {
if (NULL == gen_instance_) {
gen_instance_.reset(new paddle::framework::Generator());
}
return gen_instance_;
}
static std::shared_ptr<Generator> GetInstanceX() {
if (NULL == gen_instance_) {
gen_instance_.reset(new paddle::framework::Generator());
}
gen_instance_->is_init_py = true;
return gen_instance_;
}
private:
static std::shared_ptr<Generator> gen_instance_;
std::shared_ptr<GeneratorState> state_;
mutable std::mutex mutex;
Generator(const Generator& other, const std::lock_guard<std::mutex>&)
: state_(std::make_shared<GeneratorState>(*(other.state_))) {}
};
} // namespace framework
} // namespace paddle
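As an aside, here is a minimal usage sketch of the CPU Generator singleton declared in the header above. The standalone main() wrapper is illustrative only and is not part of this commit; it assumes the generator.h shown here is available at its normal include path and simply exercises GetInstance(), SetCurrentSeed(), GetCurrentSeed(), and Random64() as declared.

// Hypothetical standalone example (not in this commit): exercises the
// Generator API declared in paddle/fluid/framework/generator.h.
#include <cstdint>
#include <iostream>
#include "paddle/fluid/framework/generator.h"

int main() {
  // Grab the process-wide CPU generator singleton.
  auto gen = paddle::framework::Generator::GetInstance();

  // Reseed the underlying std::mt19937_64 engine and read the seed back.
  gen->SetCurrentSeed(42);
  std::cout << "seed = " << gen->GetCurrentSeed() << std::endl;

  // Draw two 64-bit random numbers; the sequence is deterministic for a
  // fixed seed because it comes from the seeded CPU engine.
  std::uint64_t a = gen->Random64();
  std::uint64_t b = gen->Random64();
  std::cout << a << " " << b << std::endl;
  return 0;
}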
@@ -368,3 +368,7 @@ REGISTER_PASS(conv_transpose_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeBNFusePass);
 REGISTER_PASS(conv_transpose_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeEltwiseAddBNFusePass);
+REGISTER_PASS(depthwise_conv_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvBNFusePass);
+REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
@@ -56,6 +56,16 @@ class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
   std::string conv_type() const { return "conv2d_transpose"; }
 };
+class DepthwiseConvBNFusePass : public ConvBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
+class DepthwiseConvEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -309,7 +309,8 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
     BriefNode *brief_node = itr.second;
     if (!Agent(brief_node->node).marked()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
+      VLOG(4) << brief_node->node->id() << " node named "
+              << brief_node->node->Name() << " is not a trt candidate.";
       continue;
     }
...
@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
     // CPU, CUDA, PLAIN are same library type.
   } else if (s == std::string("CPU")) {
     return LibraryType::kPlain;
+  } else if (s == std::string("XPU")) {
+    return LibraryType::kPlain;
   } else if (s == std::string("CUDA")) {
     return LibraryType::kPlain;
   } else {
...
@@ -78,21 +78,37 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
     std::string input_n = Inputs(in)[i];
     std::string output_n = Outputs(out)[j];
-    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
-                   in, i);
-    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_NE(input_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(output_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
     auto *in_var = block_.FindVarRecursive(input_n);
     auto *out_var = block_.FindVarRecursive(output_n);
-    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
-                   "The type of %s and %s is not the same.", input_n, output_n);
+    PADDLE_ENFORCE_EQ(
+        in_var->GetType(), out_var->GetType(),
+        platform::errors::InvalidArgument(
+            "The type of input %s and output %s do not match. The input type "
+            "is %s, output type is %s.",
+            input_n, output_n, DataTypeToString(in_var->GetType()),
+            DataTypeToString(out_var->GetType())));
     SetDim(output_n, GetDim(input_n));
   }
@@ -126,12 +142,22 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", in, i);
-    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
+    PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR &&
@@ -144,30 +170,38 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   int32_t GetLoDLevel(const std::string &in, size_t i = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size(),
-                      "Input %s of operator %s only has %d elements.", in,
-                      op_.Type(), Inputs(in).size());
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, input "
+                          "variable %s of operator %s only has %d elements.",
+                          in, op_.Type(), Inputs(in).size()));
     PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
-                      "Input %s[%d] of operator %s is @EMPTY@", in, op_.Type(),
-                      i);
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] of operator %s is empty.",
+                          in, i, op_.Type()));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     PADDLE_ENFORCE_NOT_NULL(
-        in_var, "Input %s[%d] of operator %s should not be nullptr.", in,
-        op_.Type(), i);
+        in_var, platform::errors::NotFound(
+                    "The input variable %s[%d] of operator %s is not found.",
+                    in, i, op_.Type()));
     return in_var->GetLoDLevel();
   }

   void SetLoDLevel(const std::string &out, int32_t lod_level,
                    size_t j = 0) const override {
     PADDLE_ENFORCE_LT(j, Outputs(out).size(),
-                      "Output %s of operator %s only has %d elements.", out,
-                      op_.Type(), Outputs(out).size());
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, output "
+                          "variable %s of operator %s only has %d elements.",
+                          out, op_.Type(), Outputs(out).size()));
     PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
-                      "Output %s[%d] of operator %s is @EMPTY@", out,
-                      op_.Type(), j);
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] of operator %s is empty.",
+                          out, j, op_.Type()));
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     PADDLE_ENFORCE_NOT_NULL(
-        out_var, "Output %s[%d] of operator %s should not be nullptr.", out,
-        op_.Type(), j);
+        out_var, platform::errors::NotFound(
+                     "The output variable %s[%d] of operator %s is not found.",
+                     out, j, op_.Type()));
     if (lod_level >= 0) {
       out_var->SetLoDLevel(lod_level);
     }
@@ -200,8 +234,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetInputDim(const std::string &name) const override {
     const std::vector<std::string> &arg_names = Inputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The input(%s) should hold only one element, but now "
+                          "it holds %d elements.",
+                          name, arg_names.size()));
     return this->GetDim(arg_names[0]);
   }
@@ -225,8 +261,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetOutputDim(const std::string &name, const DDim &dim) override {
     auto arg_names = Outputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The iutput(%s) should hold only one element, but "
+                          "now it holds %d elements.",
+                          name, arg_names.size()));
     SetDim(arg_names[0], dim);
   }
@@ -252,7 +290,8 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetDim(const std::string &name) const {
     auto var = block_.FindVarRecursive(name);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found.", name));
     DDim res;
     try {
       auto shape = var->GetShape();
@@ -278,7 +317,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetDims(const std::vector<std::string> &names,
                const std::vector<DDim> &dims) {
     size_t length = names.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
+    PADDLE_ENFORCE_EQ(length, dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The input variables number(%d) and input dimensions "
+                          "number(%d) do not match.",
+                          length, dims.size()));
     for (size_t i = 0; i < length; ++i) {
       if (names[i] == framework::kEmptyVarName) {
         continue;
@@ -364,8 +407,10 @@ proto::OpDesc *OpDesc::Proto() {
 const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
   auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
-                 Type());
+  PADDLE_ENFORCE_NE(
+      it, inputs_.end(),
+      platform::errors::NotFound("Input %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
@@ -385,8 +430,10 @@ void OpDesc::SetInput(const std::string &param_name,
 const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
-                 name, Type());
+  PADDLE_ENFORCE_NE(
+      it, outputs_.end(),
+      platform::errors::NotFound("Output %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
@@ -427,7 +474,8 @@ bool OpDesc::HasProtoAttr(const std::string &name) const {
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return static_cast<proto::AttrType>(it->second.which() - 1);
 }
@@ -492,7 +540,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
       return;
     }
     default:
-      PADDLE_THROW("Wrong attr type %d", attr.type());
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported attribute type (code %d).", attr.type()));
   }
   need_update_ = true;
   return;
@@ -529,7 +578,8 @@ void OpDesc::SetAttrMap(
 Attribute OpDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return it->second;
 }
@@ -543,7 +593,8 @@ const proto::OpProto::Attr &OpDesc::GetProtoAttr(
     }
   }
-  PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
+  PADDLE_THROW(platform::errors::NotFound(
+      "Attribute %s is not found in proto %s.", name, proto.type()));
 }

 Attribute OpDesc::GetNullableAttr(const std::string &name) const {
@@ -557,7 +608,10 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
 std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   auto blocks = BOOST_GET_CONST(std::vector<BlockDesc *>, it->second);
   std::vector<int> ids;
@@ -570,7 +624,10 @@ std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
 int OpDesc::GetBlockAttrId(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   return BOOST_GET_CONST(BlockDesc *, it->second)->ID();
 }
@@ -657,7 +714,11 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     VectorToRepeated(v, attr_->mutable_longs());
   }
-  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+  void operator()(boost::blank) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method of SetAttrDescVisitor object for "
+        "`boosst::blank` type."));
+  }
 };

 void OpDesc::Flush() {
@@ -691,8 +752,9 @@ void OpDesc::Flush() {
 }

 void OpDesc::CheckAttrs() {
-  PADDLE_ENFORCE(!Type().empty(),
-                 "CheckAttr() can not be called before type is set.");
+  PADDLE_ENFORCE_EQ(Type().empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "CheckAttrs() can not be called before type is set."));
   auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
   if (checker == nullptr) {
     // checker is not configured. That operator could be generated by Paddle,
@@ -707,8 +769,10 @@ void OpDesc::InferShape(const BlockDesc &block) const {
   try {
     VLOG(3) << "CompileTime infer shape on " << Type();
     auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
-    PADDLE_ENFORCE(static_cast<bool>(infer_shape),
-                   "%s's infer_shape has not been registered", this->Type());
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(infer_shape), true,
+        platform::errors::NotFound(
+            "Operator %s's infer_shape is not registered.", this->Type()));
     CompileTimeInferShapeContext ctx(*this, block);
     if (VLOG_IS_ON(10)) {
       std::ostringstream sout;
@@ -758,10 +822,10 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Input(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(input_names[0]);
 }
@@ -774,10 +838,10 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Output(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(output_names[0]);
 }
@@ -826,7 +890,8 @@ std::vector<std::string> CompileTimeInferShapeContext::Outputs(
 std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
     const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<DDim> res;
   try {
     auto shapes = var->GetShapes();
@@ -848,7 +913,8 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 void CompileTimeInferShapeContext::SetRepeatedDims(
     const std::string &name, const std::vector<DDim> &dims) {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<std::vector<int64_t>> dim_vec(dims.size());
   std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>);
   var->SetShapes(dim_vec);
...
@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
                               customized_name, \
                               customized_type_value, \
@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
+#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...) \
+  REGISTER_OP_KERNEL_EX( \
+      op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE, \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
 /**
  * Macro to mark what Operator and Kernel
  * we will use and tell the compiler to
...
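To make the intent of the two new macros concrete, a hedged sketch of how a kernel might be registered with REGISTER_OP_XPU_KERNEL follows. The operator name my_scale and the kernel class MyScaleXPUKernel are placeholders invented for illustration and do not appear in this commit; only the macro itself comes from the diff above.

// Hypothetical illustration only: "my_scale" and MyScaleXPUKernel are
// placeholder names; only REGISTER_OP_XPU_KERNEL comes from this commit.
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
class MyScaleXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // A real kernel would read its inputs from ctx and launch XPU work here.
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
// Expands to REGISTER_OP_KERNEL(my_scale, XPU, ::paddle::platform::XPUPlace, ...).
REGISTER_OP_XPU_KERNEL(my_scale, ops::MyScaleXPUKernel<float>);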
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <boost/any.hpp>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace compatible {
struct OpUpdateRecord {
enum class Type { kInvalid = 0, kModifyAttr, kNewAttr };
Type type_;
std::string remark_;
};
struct ModifyAttr : OpUpdateRecord {
ModifyAttr(const std::string& name, const std::string& remark,
boost::any default_value)
: OpUpdateRecord({Type::kModifyAttr, remark}),
name_(name),
default_value_(default_value) {
// TODO(Shixiaowei02): Check the data type with proto::OpDesc.
}
private:
std::string name_;
boost::any default_value_;
};
struct NewAttr : OpUpdateRecord {
NewAttr(const std::string& name, const std::string& remark,
boost::any default_value)
: OpUpdateRecord({Type::kNewAttr, remark}),
name_(name),
default_value_(default_value) {}
private:
std::string name_;
boost::any default_value_;
};
class OpVersionDesc {
public:
OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
boost::any default_value) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::ModifyAttr(name, remark, default_value)));
return *this;
}
OpVersionDesc& NewAttr(const std::string& name, const std::string& remark,
boost::any default_value) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewAttr(name, remark, default_value)));
return *this;
}
private:
std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
};
class OpVersion {
public:
OpVersion& AddCheckpoint(const std::string& note,
const OpVersionDesc& op_version_desc) {
checkpoints_.push_back(Checkpoint({note, op_version_desc}));
return *this;
}
private:
struct Checkpoint {
std::string note_;
OpVersionDesc op_version_desc_;
};
std::vector<Checkpoint> checkpoints_;
};
class OpVersionRegistrar {
public:
static OpVersionRegistrar& GetInstance() {
static OpVersionRegistrar instance;
return instance;
}
OpVersion& Register(const std::string& op_type) {
if (op_version_map_.find(op_type) != op_version_map_.end()) {
PADDLE_THROW("'%s' is registered in operator version more than once.",
op_type);
}
op_version_map_.insert({op_type, OpVersion()});
return op_version_map_[op_type];
}
private:
std::unordered_map<std::string, OpVersion> op_version_map_;
OpVersionRegistrar() = default;
OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete;
};
} // namespace compatible
} // namespace framework
} // namespace paddle
#define REGISTER_OP_VERSION(op_type) \
static paddle::framework::compatible::OpVersion \
RegisterOpVersion__##op_type = \
paddle::framework::compatible::OpVersionRegistrar::GetInstance() \
.Register(#op_type)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace compatible {
TEST(test_operator_version, test_operator_version) {
REGISTER_OP_VERSION(test__)
.AddCheckpoint(
R"ROC(
Upgrade reshape, modified one attribute [axis] and add a new attribute [size].
)ROC",
framework::compatible::OpVersionDesc()
.ModifyAttr("axis",
"Increased from the original one method to two.", -1)
.NewAttr("size",
"In order to represent a two-dimensional rectangle, the "
"parameter size is added.",
0))
.AddCheckpoint(
R"ROC(
Add a new attribute [height]
)ROC",
framework::compatible::OpVersionDesc().NewAttr(
"height",
"In order to represent a two-dimensional rectangle, the "
"parameter height is added.",
0));
}
} // namespace compatible
} // namespace framework
} // namespace paddle
...@@ -34,6 +34,9 @@ limitations under the License. */ ...@@ -34,6 +34,9 @@ limitations under the License. */
#include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else #else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif
} else if (platform::is_xpu_place(place)) {
#ifndef PADDLE_WITH_XPU
PADDLE_THROW(platform::errors::Unimplemented(
"Cannot run operator on place %s", place));
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif #endif
} }
...@@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, ...@@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key); kernel_iter = kernels.find(expected_kernel_key);
} }
#endif
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif #endif
if (kernel_iter == kernels.end()) { if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_, PADDLE_THROW("op %s does not have kernel for %s", type_,
......
...@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy,
ir::Graph *graph) ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) { : member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
platform::errors::Unavailable(
"XPU is not supported in ParallelExecutor"));
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size()); member_->places_.size());
member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_cuda_ = exec_strategy.use_cuda_;
......
...@@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, ...@@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
should_run.push_back(true); should_run.push_back(true);
} else { } else {
should_run.push_back(false); should_run.push_back(false);
// If an op's output modifies feed vars, the op should not be pruned.
// For example, in the transformer structure, the third output returned
// by the beam_search op is generally assigned to a feed var. Pruning that
// assign op would cause an error.
if (parent_block_id != -1) {
bool flag = false;
for (auto& var : op_desc.outputs()) {
for (auto& argu : var.arguments()) {
if (feed_var_names.count(argu)) {
flag = true;
}
}
}
if (flag) {
should_run.back() = true;
}
}
} }
} }
......
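The block added above keeps an op alive whenever one of its outputs is also a feed variable. As a reading aid only, the same check factored into a standalone helper might look like the sketch below; the name `OutputTouchesFeedVar` is hypothetical, not part of this patch, and the snippet assumes the surrounding prune.cc context (namespace `paddle::framework`, `proto::OpDesc`).

```
// Hypothetical helper equivalent to the flag computation in the hunk above:
// returns true if any output argument of the op is also a feed variable.
bool OutputTouchesFeedVar(const proto::OpDesc& op_desc,
                          const std::set<std::string>& feed_var_names) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
      if (feed_var_names.count(argu)) return true;
    }
  }
  return false;
}
```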
...@@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) { ...@@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) {
EXPECT_EQ(pruned.blocks(0).ops_size(), 2); EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
EXPECT_EQ(pruned.blocks(1).ops_size(), 1); EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
} }
// If an op's output modifies feed vars, the op should not be pruned.
TEST(Prune, recurrrent_op_2) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::BlockDesc *sub_block = program.AppendBlock(*block);
AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
f::AttributeMap{}, block);
std::vector<std::string> state_var_name(1, "y");
AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1, c1"}}},
{{"ex_states", state_var_name},
{"states", state_var_name},
{"sub_block", sub_block}},
block);
EXPECT_TRUE(sub_block != nullptr);
AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"a"}}},
f::AttributeMap{}, sub_block);
f::proto::ProgramDesc *pdesc = program.Proto();
pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
f::proto::ProgramDesc pruned;
std::set<std::string> feed_var_names = {"x", "a"};
f::Prune(*pdesc, feed_var_names, &pruned);
EXPECT_EQ(pruned.blocks_size(), 2);
EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
}
...@@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx, const platform::DeviceContext& dev_ctx,
const size_t& seek, const std::vector<int64_t>& shape); const size_t& seek, const std::vector<int64_t>& shape);
// store the bool result tensor in out tensor
void TensorContainsNANV2(const framework::Tensor& tensor,
framework::Tensor* out);
void TensorContainsInfV2(const framework::Tensor& tensor,
framework::Tensor* out);
void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out);
// convert dlpack's DLTensor to tensor // convert dlpack's DLTensor to tensor
void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst); void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst);
......
...@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_); blas.AXPY(numel_, 1., x_, y_);
} }
void operator()(const platform::XPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) { void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx = platform::CUDADeviceContext* ctx =
......
...@@ -42,11 +42,6 @@ static void PrepareData(const platform::Place& place, ...@@ -42,11 +42,6 @@ static void PrepareData(const platform::Place& place,
for (const auto& var_base : name_pair.second) { for (const auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var()); const auto* tensor = GetTensorFromVar(var_base->Var());
if (tensor && tensor->IsInitialized()) { if (tensor && tensor->IsInitialized()) {
auto tmp_place = tensor->place();
// TODO(jiabin): Support transform data layout when we Verify it on more
// tests
if (!(tmp_place == place)) {
auto kernel_type_for_var = op.GetKernelTypeForVar( auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key); name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
...@@ -62,7 +57,6 @@ static void PrepareData(const platform::Place& place, ...@@ -62,7 +57,6 @@ static void PrepareData(const platform::Place& place,
} }
} }
} }
}
} }
PreparedOp::PreparedOp(const framework::OperatorBase& op, PreparedOp::PreparedOp(const framework::OperatorBase& op,
...@@ -93,12 +87,26 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins, ...@@ -93,12 +87,26 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second; auto& kernels = kernels_iter->second;
framework::RuntimeContext ctx({}, {}); framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// The MKLDNN variant of the code reads attributes in some of the
// GetKernelTypeForVar and GetKernelType functions, so we need to copy the
// attributes there. The const qualifier of Attrs() has to be discarded to
// overwrite it.
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
#endif
auto expected_kernel_key = auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>( op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs)); op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key; VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound( platform::errors::NotFound(
......
...@@ -176,7 +176,7 @@ TEST(test_prepare_op, test_prepare_data) { ...@@ -176,7 +176,7 @@ TEST(test_prepare_op, test_prepare_data) {
} }
#endif #endif
TEST(test_prepare_op, test_prepare_data_same_place) { void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
std::shared_ptr<imperative::VarBase> vin( std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin")); new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout( std::shared_ptr<imperative::VarBase> vout(
...@@ -198,7 +198,6 @@ TEST(test_prepare_op, test_prepare_data_same_place) { ...@@ -198,7 +198,6 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
var_pair out_pair = var_pair("Out", vb_vector(1, vout)); var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair}; imperative::NameVarBaseMap ins = {x_pair};
imperative::NameVarBaseMap outs = {out_pair}; imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap attr_map;
const std::string op_type = "relu"; const std::string op_type = "relu";
const auto& info = framework::OpInfoMap::Instance().Get(op_type); const auto& info = framework::OpInfoMap::Instance().Get(op_type);
if (info.Checker()) info.Checker()->Check(&attr_map); if (info.Checker()) info.Checker()->Check(&attr_map);
...@@ -222,8 +221,21 @@ TEST(test_prepare_op, test_prepare_data_same_place) { ...@@ -222,8 +221,21 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
} }
} }
} }
TEST(test_prepare_op, test_prepare_data_same_place) {
TestPrepareDataSamePlace({});
}
#ifdef PADDLE_WITH_MKLDNN
TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) {
TestPrepareDataSamePlace({{"use_mkldnn", true}});
}
#endif
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(split); USE_OP(split);
USE_OP(relu); USE_OP(relu);
#ifdef PADDLE_WITH_MKLDNN
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
#endif
...@@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input, ...@@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
} else if (shape.size() == 3UL) { } else if (shape.size() == 3UL) {
return nvinfer1::Dims3(shape[0], shape[1], shape[2]); return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
} }
return nvinfer1::Dims4(shape[0], shape[1], 1, 1); nvinfer1::Dims dims;
dims.nbDims = shape.size();
for (size_t i = 0; i < shape.size(); i++) {
dims.d[i] = shape[i];
}
return dims;
} }
} }
} // NOLINT } // NOLINT
......
...@@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller {
#if IS_TRT_VERSION_GE(5130) #if IS_TRT_VERSION_GE(5130)
teller_set.insert("relu6"); teller_set.insert("relu6");
teller_set.insert("hard_sigmoid"); teller_set.insert("hard_sigmoid");
int8_teller_set.insert("relu6");
int8_teller_set.insert("hard_sigmoid");
#endif #endif
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
teller_set.insert("fused_embedding_eltwise_layernorm"); teller_set.insert("fused_embedding_eltwise_layernorm");
...@@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller {
"elementwise_add", "elementwise_add",
"leaky_relu", "leaky_relu",
"fc", "fc",
"relu6",
"concat", "concat",
"scale", "scale",
"elementwise_mul", "elementwise_mul",
"conv2d_transpose"}; "conv2d_transpose",
"hard_swish"};
std::unordered_set<std::string> teller_set{ std::unordered_set<std::string> teller_set{
"mul", "mul",
"conv2d", "conv2d",
......
...@@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions( ...@@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
return ret; return ret;
} }
template <typename T>
void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
for (auto ptr : embs_gpu_) {
if (ptr) cudaFree(ptr);
}
if (bias_gpu_) cudaFree(bias_gpu_);
if (scale_gpu_) cudaFree(scale_gpu_);
}
template <typename T> template <typename T>
bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
...@@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue( ...@@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
int64_t *emb_ptr_gpu_d = int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id)); emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
std::vector<int64_t> in_ptr, emb_ptr; std::vector<uintptr_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i])); in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i])); emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
......
...@@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
} }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new EmbEltwiseLayernormPluginDynamic( auto ptr = new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_); eps_);
ptr->embs_gpu_ = embs_gpu_;
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
} }
const char* getPluginType() const override { const char* getPluginType() const override {
...@@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
return sum_num; return sum_num;
} }
void terminate() override;
void serialize(void* buffer) const override { void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_); // SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_); SerializeValue(&buffer, emb_sizes_);
......
...@@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, ...@@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
void PReluPluginDynamic::terminate() {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
}
}
int PReluPluginDynamic::initialize() { int PReluPluginDynamic::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
......
...@@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT { ...@@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
} }
~PReluPluginDynamic() { cudaFree(p_gpu_weight_); } ~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new PReluPluginDynamic(weight_.data(), weight_.size(), mode_); auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
} }
const char* getPluginType() const override { return "prelu_plugin"; } const char* getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; } int getNbOutputs() const override { return 1; }
int initialize() override; int initialize() override;
void terminate() override;
size_t getSerializationSize() const override; size_t getSerializationSize() const override;
void serialize(void* buffer) const override; void serialize(void* buffer) const override;
......
...@@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { ...@@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
} }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SkipLayerNormPluginDynamic( auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_); bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
} }
const char* getPluginType() const override { return "skip_layernorm_plugin"; } const char* getPluginType() const override { return "skip_layernorm_plugin"; }
......
...@@ -471,19 +471,10 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -471,19 +471,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif() endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_serialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_serialized)
endif() endif()
set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
......
...@@ -66,7 +66,7 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -66,7 +66,7 @@ TEST(AnalysisPredictor, use_gpu) {
float* data_o = static_cast<float*>(outputs[0].data.data()); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); j += 10) { for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); j += 10) {
EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0., EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
10e-5); 12e-5);
} }
} }
......
...@@ -123,8 +123,11 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -123,8 +123,11 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false); config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape); opt_input_shape);
AnalysisConfig* config_deser = new AnalysisConfig(config);
std::vector<float> out_data; std::vector<float> out_data;
run(config, &out_data); run(config, &out_data); // serialize
run(*config_deser, &out_data); // deserialize
for (size_t i = 0; i < out_data.size(); i++) { for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6); EXPECT_NEAR(result[i], out_data[i], 1e-6);
} }
......
...@@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
std::vector<float> out_data; std::vector<float> out_data;
run(config, &out_data); run(config, &out_data);
for (size_t i = 0; i < out_data.size(); i++) { for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6); EXPECT_NEAR(result[i], out_data[i], 1e-5);
} }
} }
......
...@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) ...@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU) if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
elseif(WITH_XPU)
set(AllocatorFacadeDeps xpu_info)
else () else ()
set(AllocatorFacadeDeps) set(AllocatorFacadeDeps)
endif() endif()
......
...@@ -39,6 +39,9 @@ ...@@ -39,6 +39,9 @@
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
DEFINE_int64( DEFINE_int64(
gpu_allocator_retry_time, 10000, gpu_allocator_retry_time, 10000,
...@@ -62,6 +65,11 @@ class AllocatorFacadePrivate { ...@@ -62,6 +65,11 @@ class AllocatorFacadePrivate {
switch (strategy) { switch (strategy) {
case AllocatorStrategy::kNaiveBestFit: { case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator(); InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
...@@ -74,6 +82,11 @@ class AllocatorFacadePrivate { ...@@ -74,6 +82,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kAutoGrowth: { case AllocatorStrategy::kAutoGrowth: {
InitNaiveBestFitCPUAllocator(); InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
...@@ -86,6 +99,11 @@ class AllocatorFacadePrivate { ...@@ -86,6 +99,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kThreadLocal: { case AllocatorStrategy::kThreadLocal: {
InitNaiveBestFitCPUAllocator(); InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
...@@ -127,6 +145,13 @@ class AllocatorFacadePrivate { ...@@ -127,6 +145,13 @@ class AllocatorFacadePrivate {
private: private:
void InitSystemAllocators() { void InitSystemAllocators() {
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>(); system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::XPUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
system_allocators_[platform::CUDAPinnedPlace()] = system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>(); std::make_shared<CPUPinnedAllocator>();
...@@ -164,6 +189,12 @@ class AllocatorFacadePrivate { ...@@ -164,6 +189,12 @@ class AllocatorFacadePrivate {
} }
#endif #endif
#ifdef PADDLE_WITH_XPU
void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator { class ZeroSizeAllocator : public Allocator {
public: public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
...@@ -191,6 +222,12 @@ class AllocatorFacadePrivate { ...@@ -191,6 +222,12 @@ class AllocatorFacadePrivate {
} }
places.emplace_back(platform::CUDAPinnedPlace()); places.emplace_back(platform::CUDAPinnedPlace());
#endif #endif
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::XPUPlace(dev_id));
}
#endif
for (auto& p : places) { for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p); zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
......
...@@ -29,6 +29,9 @@ ...@@ -29,6 +29,9 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
DEFINE_bool(init_allocated_mem, false, DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by " "It is a mistake that the values of the memory allocated by "
...@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { ...@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used(); return GetCPUBuddyAllocator()->Used();
} }
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = nullptr;
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
VLOG(10) << " pointer=" << p;
return p;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
return nullptr;
#endif
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_free(p);
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
printf("Used func return 0 for XPUPlace\n");
return 0;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList { class GPUBuddyAllocatorList {
private: private:
......
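Alloc and Free above repeat the same sequence: query the current device with xpu_current_device, subtract 64 when the id refers to a simulator device, switch to the target card with xpu_set_device, do the work, then restore the previous device. A minimal sketch of that pattern as an RAII helper is shown below; the class name `XPUDeviceGuard` is an assumption, it is not part of this patch and only illustrates the save/set/restore convention used by the code above.

```
// Illustrative sketch only (assumes an XPU build and the xpu_* APIs used above).
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/xpu_header.h"

class XPUDeviceGuard {
 public:
  explicit XPUDeviceGuard(int target_dev_id) {
    int ret = xpu_current_device(&prev_dev_id_);
    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                      paddle::platform::errors::External(
                          "xpu_current_device failed with %d", ret));
    // Ids >= 64 denote simulator devices; subtract 64 to get the real id.
    if (prev_dev_id_ >= 64) prev_dev_id_ -= 64;
    ret = xpu_set_device(target_dev_id);
    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                      paddle::platform::errors::External(
                          "xpu_set_device failed with %d", ret));
  }
  ~XPUDeviceGuard() { xpu_set_device(prev_dev_id_); }

 private:
  int prev_dev_id_ = -1;
};
```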
...@@ -18,6 +18,10 @@ limitations under the License. */ ...@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, ...@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
std::memcpy(dst, src, num); std::memcpy(dst, src, num);
} }
#ifdef PADDLE_WITH_XPU
template <>
void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != dst_place.device) {
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != dst_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != src_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device || dev_id != dst_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
void* tmp = malloc(num);
ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
free(tmp);
} else {
int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
......
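The three specializations above cover host-to-device, device-to-host, and device-to-device copies; note that the XPU-to-XPU path stages the data through a temporary host buffer whenever the two places are on different cards. A hedged usage sketch follows; it assumes an XPU build and that `dev0`/`dev1` were already allocated on XPU devices 0 and 1, for example with the Alloc<platform::XPUPlace> shown earlier.

```
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

// Usage sketch only; dev0 and dev1 are assumed to be valid buffers already
// allocated on XPU devices 0 and 1.
void CopyRoundTrip(void* dev0, void* dev1) {
  const size_t bytes = 1024 * sizeof(float);
  std::vector<float> host(1024, 1.0f);
  paddle::platform::CPUPlace cpu;
  paddle::platform::XPUPlace xpu0(0), xpu1(1);

  paddle::memory::Copy(xpu0, dev0, cpu, host.data(), bytes);  // XPU_HOST_TO_DEVICE
  paddle::memory::Copy(xpu1, dev1, xpu0, dev0, bytes);  // staged through a host buffer
  paddle::memory::Copy(cpu, host.data(), xpu1, dev1, bytes);  // XPU_DEVICE_TO_HOST
}
```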
...@@ -88,7 +88,9 @@ endif() ...@@ -88,7 +88,9 @@ endif()
cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator) cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows
lod_tensor maxouting unpooling pooling lod_rank_table context_project
sequence_pooling executor device_memory_aligment generator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
...@@ -121,7 +123,7 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t ...@@ -121,7 +123,7 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator)
if (WITH_GPU) if (WITH_GPU)
nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3) nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3)
else() else()
......
...@@ -219,7 +219,7 @@ $$out = \\frac{1}{\\sqrt{x}}$$ ...@@ -219,7 +219,7 @@ $$out = \\frac{1}{\\sqrt{x}}$$
)DOC"; )DOC";
UNUSED constexpr char AbsDoc[] = R"DOC( UNUSED constexpr char AbsDoc[] = R"DOC(
Abs Activation Operator. Abs Operator.
$$out = |x|$$ $$out = |x|$$
...@@ -242,6 +242,9 @@ $$out = \\left \\lfloor x \\right \\rfloor$$ ...@@ -242,6 +242,9 @@ $$out = \\left \\lfloor x \\right \\rfloor$$
UNUSED constexpr char CosDoc[] = R"DOC( UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Operator. Computes cosine of x element-wise. Cosine Operator. Computes cosine of x element-wise.
Input range is `(-inf, inf)` and output range is `[-1,1]`.
Returns `nan` if the input is out of boundary (e.g. `inf`).
$$out = cos(x)$$ $$out = cos(x)$$
)DOC"; )DOC";
...@@ -314,13 +317,6 @@ $$out = x^2$$ ...@@ -314,13 +317,6 @@ $$out = x^2$$
)DOC"; )DOC";
UNUSED constexpr char SoftplusDoc[] = R"DOC(
Softplus Activation Operator.
$$out = \ln(1 + e^{x})$$
)DOC";
UNUSED constexpr char SoftsignDoc[] = R"DOC( UNUSED constexpr char SoftsignDoc[] = R"DOC(
Softsign Activation Operator. Softsign Activation Operator.
...@@ -334,7 +330,7 @@ class AcosOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -334,7 +330,7 @@ class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of acos operator"); AddInput("X", "Input of acos operator");
AddOutput("Out", "Output of acos operator"); AddOutput("Out", "Output of acos operator");
AddComment(R"DOC( AddComment(R"DOC(
Arccosine Activation Operator. Arccosine Operator.
$$out = \cos^{-1}(x)$$ $$out = \cos^{-1}(x)$$
...@@ -348,7 +344,7 @@ class AsinOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -348,7 +344,7 @@ class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of asin operator"); AddInput("X", "Input of asin operator");
AddOutput("Out", "Output of asin operator"); AddOutput("Out", "Output of asin operator");
AddComment(R"DOC( AddComment(R"DOC(
Arcsine Activation Operator. Arcsine Operator.
$$out = \sin^{-1}(x)$$ $$out = \sin^{-1}(x)$$
...@@ -362,9 +358,9 @@ class AtanOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -362,9 +358,9 @@ class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of atan operator"); AddInput("X", "Input of atan operator");
AddOutput("Out", "Output of atan operator"); AddOutput("Out", "Output of atan operator");
AddComment(R"DOC( AddComment(R"DOC(
Arctanh Activation Operator. Arctangent Operator.
$$out = \tanh^{-1}(x)$$ $$out = \tan^{-1}(x)$$
)DOC"); )DOC");
} }
...@@ -393,6 +389,36 @@ $$out = \max(x, \alpha * x)$$ ...@@ -393,6 +389,36 @@ $$out = \max(x, \alpha * x)$$
} }
}; };
class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"Input of Softplus operator, an N-D Tensor, with data type "
"float32, float64 or float16.");
AddOutput(
"Out",
"Output of Softplus operator, a Tensor with shape same as input.");
AddAttr<float>("beta", "The value of beta for Softplus.").SetDefault(1.0f);
AddAttr<float>("threshold", "The value of threshold for Softplus.")
.SetDefault(20.0f);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel.")
.SetDefault(false);
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn.")
.SetDefault(false);
AddComment(R"DOC(
:strong:`Softplus Activation Operator`
.. math::
out = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) \\
\text{For numerical stability, the implementation reverts to the linear function when: } x \times \beta > threshold.
)DOC");
}
};
class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -669,7 +695,6 @@ REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); ...@@ -669,7 +695,6 @@ REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc);
REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc); REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc);
REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
template <ActBwdOpFwdDeps kDepValue> template <ActBwdOpFwdDeps kDepValue>
...@@ -756,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> { ...@@ -756,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
} }
}; };
// leaky_relu Grad: dx=dy if y>=0 else alpha * dy // leaky_relu Grad: dx=dy if x>=0 else alpha * dy
// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx // leaky_relu GradGrad: ddy=ddx if x>=0 else alpha * ddx
template <typename T> template <typename T>
class LeakyReluDoubleGradMaker class LeakyReluDoubleGradMaker
: public ::paddle::framework::SingleGradOpMaker<T> { : public ::paddle::framework::SingleGradOpMaker<T> {
...@@ -767,8 +792,8 @@ class LeakyReluDoubleGradMaker ...@@ -767,8 +792,8 @@ class LeakyReluDoubleGradMaker
protected: protected:
void Apply(GradOpPtr<T> op) const override { void Apply(GradOpPtr<T> op) const override {
op->SetType("leaky_relu_grad_grad"); op->SetType("leaky_relu_grad_grad");
// input1: Out // input1: X
op->SetInput("Out", this->Input("Out")); op->SetInput("X", this->Input("X"));
// X@GRAD@GRAD: ddx // X@GRAD@GRAD: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetAttrMap(this->Attrs()); op->SetAttrMap(this->Attrs());
......
...@@ -388,9 +388,9 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> { ...@@ -388,9 +388,9 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
} }
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const { void operator()(Device d, X x, Out out) const {
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>(); auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>(); auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 + temp2); out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
} }
}; };
...@@ -405,9 +405,9 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> { ...@@ -405,9 +405,9 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>(); auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>(); auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 + temp2).template cast<T>(); dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
...@@ -975,32 +975,46 @@ struct HardSwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -975,32 +975,46 @@ struct HardSwishGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
}; };
// softplus(x) = log(1 + exp(x)) // For numerical stability, using the following formula instead of softplus(x) =
// When x is a very large positive number, exp(x) may explode to inf, // log(1 + exp(x))
// Using trick below for numerical stability // softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta =
// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ // 1, threshold = 20 by default), otherwise x
// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
template <typename T> template <typename T>
struct SoftplusFunctor : public BaseActivationFunctor<T> { struct SoftplusFunctor : public BaseActivationFunctor<T> {
float beta;
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) { void operator()(Device d, X x, Out out) {
auto temp = x.cwiseMax(static_cast<T>(0)); // temp = max(x, 0) auto x_beta = static_cast<T>(beta) * x;
out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); out.device(d) = (x_beta > static_cast<T>(threshold))
.select(x, (static_cast<T>(1) + x_beta.exp()).log() /
static_cast<T>(beta));
} }
}; };
// d(softplus(x))/dx = exp(x) / (1 + exp(x)) // For numerical stability, using the following formula instead of
// For numerical stability: // d(softplus(x))/dx = 1 / (1 + exp(-x))
// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + // d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta
// exp(x - max(x, 0))) // = 1, threshold = 20 by default), otherwise 1
template <typename T> template <typename T>
struct SoftplusGradFunctor : public BaseActivationFunctor<T> { struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
float beta;
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) { void operator()(Device d, X x, Out out, dOut dout, dX dx) {
auto temp = x.cwiseMax(static_cast<T>(0)); // temp = max(x, 0) auto x_beta = static_cast<T>(beta) * x;
dx.device(d) = dx.device(d) =
dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); (x_beta > static_cast<T>(threshold))
.select(dout, dout / (static_cast<T>(1) + (-x_beta).exp()));
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
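In scalar form, the functors above compute softplus(x) = log(1 + exp(beta * x)) / beta while beta * x stays below the threshold and fall back to the identity x (with gradient 1) above it, which avoids evaluating exp of a huge argument. Below is a small standalone sanity check of that piecewise definition, using the defaults beta = 1 and threshold = 20 from SoftplusOpMaker; it is a sketch only, not part of this patch.

```
#include <cmath>
#include <cstdio>

// Scalar reference of the piecewise softplus used by SoftplusFunctor /
// SoftplusGradFunctor above (beta = 1, threshold = 20 are the op defaults).
double softplus(double x, double beta = 1.0, double threshold = 20.0) {
  const double xb = beta * x;
  return xb > threshold ? x : std::log1p(std::exp(xb)) / beta;
}

double softplus_grad(double x, double beta = 1.0, double threshold = 20.0) {
  const double xb = beta * x;
  return xb > threshold ? 1.0 : 1.0 / (1.0 + std::exp(-xb));
}

int main() {
  // Large inputs no longer overflow, since exp(1000) is never evaluated.
  std::printf("%g %g\n", softplus(1000.0), softplus_grad(1000.0));  // 1000 1
  std::printf("%g %g\n", softplus(0.0), softplus_grad(0.0));        // 0.693147 0.5
  return 0;
}
```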
...@@ -1070,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> { ...@@ -1070,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const { void operator()(Device d, X x, Out out) const {
if (alpha < 1.f) {
out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x); out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
} else {
out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
}
} }
}; };
...@@ -1084,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -1084,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = auto temp1 =
static_cast<T>(alpha) * (out <= static_cast<T>(0)).template cast<T>(); static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
auto temp2 = (out > static_cast<T>(0)).template cast<T>(); auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>(); dx.device(d) = dout * (temp1 + temp2).template cast<T>();
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
}; };
template <typename T> template <typename T>
...@@ -1116,9 +1134,20 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> { ...@@ -1116,9 +1134,20 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() + auto temp_a_pos = static_cast<T>(alpha > 0);
dout * static_cast<T>(alpha) * x.exp() * auto temp_a_neg = static_cast<T>(alpha <= 0);
(x <= static_cast<T>(0)).template cast<T>(); auto temp_x_pos = (x > static_cast<T>(0)).template cast<T>();
auto temp_x_neg = (x <= static_cast<T>(0)).template cast<T>();
// dx = dout, if alpha > 0 and x > 0
// dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0
// dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0
// dx = 0, if alpha <= 0 and x <=0
dx.device(d) =
dout * temp_a_pos * temp_x_pos +
dout * static_cast<T>(alpha) * x.exp() * temp_a_pos * temp_x_neg +
dout * (static_cast<T>(1) + static_cast<T>(alpha) * x.exp()) *
temp_a_neg * temp_x_pos;
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
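The rewritten ELU gradient distinguishes the sign of alpha as well as the sign of x, exactly as the four cases in the comment above describe. A scalar sketch of those cases, with dout folded in (illustration only, not part of this patch):

```
#include <cmath>

// Scalar form of the four cases listed in ELUGradFunctor above.
double elu_grad(double x, double dout, double alpha) {
  if (alpha > 0.0) {
    return x > 0.0 ? dout : dout * alpha * std::exp(x);
  }
  return x > 0.0 ? dout * (1.0 + alpha * std::exp(x)) : 0.0;
}
```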
...@@ -1437,18 +1466,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> { ...@@ -1437,18 +1466,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
auto* d = dev.eigen_device(); auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten( auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
auto out = framework::EigenVector<T>::Flatten( auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Output", "Out", "LeakyReluGradGrad")); GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
auto ddout = framework::EigenVector<T>::Flatten( auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
ddout.device(*d) = ddx * ddout.device(*d) =
((out > static_cast<T>(0)).template cast<T>() + ddx *
static_cast<T>(alpha) * ((x > static_cast<T>(0)).template cast<T>() +
(out <= static_cast<T>(0)).template cast<T>()) static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
.template cast<T>(); .template cast<T>();
} }
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
}; };
template <typename T> template <typename T>
......
...@@ -28,10 +28,15 @@ using Tensor = framework::Tensor; ...@@ -28,10 +28,15 @@ using Tensor = framework::Tensor;
template <typename T> template <typename T>
struct Linspace<paddle::platform::CPUDeviceContext, T> { struct Linspace<paddle::platform::CPUDeviceContext, T> {
void operator()(T start, T end, int count, framework::Tensor* numbers, void operator()(T start, T end, int count, bool align_corners,
framework::Tensor* numbers,
const framework::ExecutionContext& ctx) { const framework::ExecutionContext& ctx) {
T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace()); T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace());
T slice = (end - start) / (T)(count - 1); T slice = (end - start) / (T)(count - 1);
if (!align_corners) {
slice = (end - start) / (T)count;
start *= (T)(count - 1) / (T)count;
}
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
number_data[i] = start + (T)i * slice; number_data[i] = start + (T)i * slice;
} }
...@@ -130,6 +135,10 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -130,6 +135,10 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
"use_cudnn", "use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn") "(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(true); .SetDefault(true);
AddAttr<bool>("align_corners",
"(bool, default false) Whether to align the corners of input"
"and ouput.")
.SetDefault(true);
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"output_shape", "output_shape",
"The target output image shape with format [N, C, H, W].") "The target output image shape with format [N, C, H, W].")
...@@ -164,10 +173,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -164,10 +173,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
[-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]] [-1. -0.5 0. 0.5 1. ]]]
C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. C[0] is the coordinates in height axis and C[1] is the coordinates in
width axis.
Step2: Step2:
Transpose and reshape C to shape [H * W, 2] and append ones to last dimension. Then we get: Transpose and reshape C to shape [H * W, 2] and append ones to last
dimension. Then we get:
C_ = [[-1. -1. 1. ] C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ] [-0.5 -1. 1. ]
[ 0. -1. 1. ] [ 0. -1. 1. ]
......
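With align_corners left at true, the Linspace functor spaces count points from start to end inclusive (slice = (end - start) / (count - 1)); with align_corners = false it uses slice = (end - start) / count and scales start by (count - 1) / count, which shifts the samples toward cell centers. A standalone sketch of the two modes for start = -1, end = 1, count = 5 follows; the CUDA functor in the new affine_grid_op.cu below uses the same arithmetic.

```
#include <cstdio>
#include <vector>

// Scalar version of the Linspace arithmetic used by the CPU and CUDA functors.
std::vector<float> linspace(float start, float end, int count, bool align_corners) {
  float slice = (end - start) / static_cast<float>(count - 1);
  if (!align_corners) {
    slice = (end - start) / static_cast<float>(count);
    start *= static_cast<float>(count - 1) / static_cast<float>(count);
  }
  std::vector<float> out(count);
  for (int i = 0; i < count; ++i) out[i] = start + static_cast<float>(i) * slice;
  return out;
}

int main() {
  // align_corners = true : -1.0 -0.5  0.0  0.5  1.0
  // align_corners = false: -0.8 -0.4  0.0  0.4  0.8
  for (bool ac : {true, false}) {
    for (float v : linspace(-1.f, 1.f, 5, ac)) std::printf("% .1f ", v);
    std::printf("\n");
  }
  return 0;
}
```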
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/affine_grid_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
struct Linspace<paddle::platform::CUDADeviceContext, T> {
void operator()(T start, T end, int count, bool align_corners,
framework::Tensor* numbers,
const framework::ExecutionContext& ctx) {
T* number_data = numbers->mutable_data<T>({count}, ctx.GetPlace());
T slice = (end - start) / (T)(count - 1);
if (!align_corners) {
slice = (end - start) / (T)count;
start *= (T)(count - 1) / (T)count;
}
auto stream = ctx.cuda_device_context().stream();
int block = 512;
int grid = (count + block - 1) / block;
LinspaceKernel<T><<<grid, block, 0, stream>>>(start, slice, count,
number_data);
}
};
template <typename T>
__global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w,
T h_start, T w_start, T h_step, T w_step,
const T* theta, // N, 2, 3
T* output) {
CUDA_KERNEL_LOOP(index, count) {
int w = index % out_w;
int h = (index / out_w) % out_h;
int n = index / (out_w * out_h);
T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
int theta_offset = n * 6; // 2 * 3;
// affine from (h_coor, w_coor) to (x, y)
output[index * 2] = theta[theta_offset] * h_coor +
theta[theta_offset + 1] * w_coor +
theta[theta_offset + 2];
output[index * 2 + 1] = theta[theta_offset + 3] * h_coor +
theta[theta_offset + 4] * w_coor +
theta[theta_offset + 5];
}
}
template <typename T>
__global__ void affine_grid_grad_kernel(const int count, int n, int out_h,
int out_w, T h_start, T w_start,
T h_step, T w_step,
const T* out_grad, // N, H, W, 2
T* theta_grad) { // N, 2, 3
CUDA_KERNEL_LOOP(index, count) {
int w = index % out_w;
int h = (index / out_w) % out_h;
int n = index / (out_w * out_h);
T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
int theta_offset = n * 6; // 2 * 3;
T out_grad_x = out_grad[index * 2];
atomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
atomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
atomicAdd(theta_grad + theta_offset + 2, out_grad_x);
T out_grad_y = out_grad[index * 2 + 1];
atomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
atomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
atomicAdd(theta_grad + theta_offset + 5, out_grad_y);
}
}
template <typename T>
class AffineGridOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* theta = ctx.Input<Tensor>("Theta");
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
auto* output = ctx.Output<Tensor>("Output");
T* out_data = output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
T h_step;
T w_step;
T h_start = -1;
T w_start = -1;
if (align_corners) {
h_step = static_cast<T>(2) / static_cast<T>(h - 1);
w_step = static_cast<T>(2) / static_cast<T>(w - 1);
} else {
h_step = static_cast<T>(2) / static_cast<T>(h);
w_step = static_cast<T>(2) / static_cast<T>(w);
h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
}
const int count = n * h * w;
int block = 512;
int grid = (count + block - 1) / block;
auto cu_stream = ctx.cuda_device_context().stream();
affine_grid_kernel<<<grid, block, 0, cu_stream>>>(
count, n, h, w, h_start, w_start, h_step, w_step,
theta->data<T>(), // N, 2, 3
out_data);
}
};
template <typename T>
class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
ctx.cuda_device_context(), theta_grad, static_cast<T>(0));
T h_step;
T w_step;
T h_start = -1;
T w_start = -1;
if (align_corners) {
h_step = static_cast<T>(2) / static_cast<T>(h - 1);
w_step = static_cast<T>(2) / static_cast<T>(w - 1);
} else {
h_step = static_cast<T>(2) / static_cast<T>(h);
w_step = static_cast<T>(2) / static_cast<T>(w);
h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
}
const int count = n * h * w;
VLOG(3) << "count: " << count << "; h_step: " << h_step
<< "; w_step: " << w_step << "; h_start: " << h_start
<< "; w_start: " << w_start;
int block = 512;
int grid = (count + block - 1) / block;
auto cu_stream = ctx.cuda_device_context().stream();
affine_grid_grad_kernel<<<grid, block, 0, cu_stream>>>(
count, n, h, w, h_start, w_start, h_step, w_step,
output_grad->data<T>(), theta_grad_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(affine_grid, ops::AffineGridOpCUDAKernel<float>,
ops::AffineGridOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(affine_grid_grad,
ops::AffineGridGradOpCUDAKernel<float>,
ops::AffineGridGradOpCUDAKernel<double>);
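The Linspace functor and the CUDA kernels above compute each normalized coordinate as `start + step * index`, with a different `start`/`step` pair when `align_corners` is false. A minimal NumPy sketch of that formula follows; it is an illustration only, and `grid_coords` is a hypothetical helper name.

```python
import numpy as np

def grid_coords(size, align_corners):
    """Normalized coordinates produced by the start/step logic in the kernels above."""
    start = -1.0
    if align_corners:
        step = 2.0 / (size - 1)
    else:
        step = 2.0 / size
        start *= (size - 1) / size   # shrink the range so samples sit at pixel centers
    return start + step * np.arange(size)

print(grid_coords(5, True))    # [-1.  -0.5  0.   0.5  1. ]
print(grid_coords(5, False))   # [-0.8 -0.4  0.   0.4  0.8]
```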
...@@ -37,12 +37,13 @@ using Array4 = Eigen::DSizes<int64_t, 4>; ...@@ -37,12 +37,13 @@ using Array4 = Eigen::DSizes<int64_t, 4>;
*/ */
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct Linspace { struct Linspace {
void operator()(T start, T end, int count, framework::Tensor* numbers, void operator()(T start, T end, int count, bool align_corners,
framework::Tensor* numbers,
const framework::ExecutionContext& ctx); const framework::ExecutionContext& ctx);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
inline void GetIdxMap(int n, int h, int w, Tensor* grid, inline void GetIdxMap(int n, int h, int w, bool align_corners, Tensor* grid,
const framework::ExecutionContext& ctx) { const framework::ExecutionContext& ctx) {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace()); grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
...@@ -50,16 +51,19 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid, ...@@ -50,16 +51,19 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
// Get indexes of height with shape [height, width, 1] // Get indexes of height with shape [height, width, 1]
Tensor h_idx; Tensor h_idx;
Linspace<DeviceContext, T> linspace; Linspace<DeviceContext, T> linspace;
linspace((T)-1, (T)1, h, &h_idx, ctx); linspace((T)-1, (T)1, h, align_corners, &h_idx, ctx);
auto h_idx_t = EigenTensor<T, 1>::From(h_idx); auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
// Get indexes of width with shape [height, width, 1] // Get indexes of width with shape [height, width, 1]
Tensor w_idx; Tensor w_idx;
linspace((T)-1, (T)1, w, &w_idx, ctx); linspace((T)-1, (T)1, w, align_corners, &w_idx, ctx);
auto w_idx_t = EigenTensor<T, 1>::From(w_idx); auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
// Get constant ones tensor with shape [height, width, 1] // Get constant ones tensor with shape [height, width, 1]
Tensor ones; Tensor ones;
ones.mutable_data<T>({h, w, 1}, ctx.GetPlace()); ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &ones, static_cast<T>(1));
auto ones_t = EigenTensor<T, 3>::From(ones);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones // ones
Tensor w_idx_map; Tensor w_idx_map;
...@@ -74,11 +78,9 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid, ...@@ -74,11 +78,9 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
Tensor w_h_one_idx_map; Tensor w_h_one_idx_map;
w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace()); w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace());
auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map); auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map);
w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w)) w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w))
.broadcast(Array2(h, 1)) .broadcast(Array2(h, 1))
.reshape(Array3(h, w, 1)); .reshape(Array3(h, w, 1));
h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h)) h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h))
.broadcast(Array2(w, 1)) .broadcast(Array2(w, 1))
.shuffle(Array2(1, 0)) .shuffle(Array2(1, 0))
...@@ -97,6 +99,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> { ...@@ -97,6 +99,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
auto* theta = ctx.Input<Tensor>("Theta"); auto* theta = ctx.Input<Tensor>("Theta");
int n = theta->dims()[0]; int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape"); auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0; int h = 0;
int w = 0; int w = 0;
if (size_attr.size() == 0) { if (size_attr.size() == 0) {
...@@ -116,7 +119,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> { ...@@ -116,7 +119,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
ctx.template device_context<DeviceContext>(), output, ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0)); static_cast<T>(0));
Tensor grid; Tensor grid;
GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx); GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
// output = grid * theta.T // output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
...@@ -140,6 +143,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -140,6 +143,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta")); auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0]; int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape"); auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0; int h = 0;
int w = 0; int w = 0;
if (size_attr.size() == 0) { if (size_attr.size() == 0) {
...@@ -158,7 +162,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -158,7 +162,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
ctx.template device_context<DeviceContext>(), theta_grad, ctx.template device_context<DeviceContext>(), theta_grad,
static_cast<T>(0)); static_cast<T>(0));
Tensor grid; Tensor grid;
GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx); GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
// output = grid * theta.T // output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
......
...@@ -22,9 +22,11 @@ namespace operators { ...@@ -22,9 +22,11 @@ namespace operators {
class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Input", "The first input tensor to compare."); AddInput("Input",
AddInput("Other", "The second input tensor to compare."); "The input tensor, it's data type should be float32, float64.");
AddOutput("Out", "The output tensor of allclose op."); AddInput("Other",
"The input tensor, it's data type should be float32, float64.");
AddOutput("Out", "The output tensor, it's data type is bool.");
AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .") AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .")
.SetDefault(1e-5); .SetDefault(1e-5);
...@@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
This operator checks if all :math:`input` and :math:`other` satisfy the condition: This operator checks if all :math:`x` and :math:`y` satisfy the condition:
:math:`\left| input - other \right| \leq atol + rtol \times \left| other \right|` .. math::
\left| x - y \right| \leq atol + rtol \times \left| y \right|
elementwise, for all elements of :math:`input` and :math:`other`. The behaviour of this elementwise, for all elements of :math:`x` and :math:`y`. The behaviour of this
operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if
two tensors are elementwise equal within a tolerance. two tensors are elementwise equal within a tolerance.
)DOC"); )DOC");
......
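The condition in the comment above, |x - y| <= atol + rtol * |y| reduced with a logical AND over all elements, can be written directly in NumPy. A minimal sketch, assuming the same defaults as numpy.allclose (the op's atol default is not visible in this hunk, so treating it as 1e-8 is an assumption):

```python
import numpy as np

def allclose_ref(x, y, rtol=1e-5, atol=1e-8, equal_nan=False):
    """Elementwise |x - y| <= atol + rtol * |y|, reduced with all()."""
    close = np.abs(x - y) <= atol + rtol * np.abs(y)
    if equal_nan:
        close = close | (np.isnan(x) & np.isnan(y))
    return bool(np.all(close))

x = np.array([1.0, 2.0, 3.0])
y = np.array([1.0, 2.0, 3.0 + 1e-7])
print(allclose_ref(x, y), np.allclose(x, y))   # both True for the default tolerances
```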
...@@ -12,18 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
arg_max, arg_max, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMax>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMax>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMax>,
double>, paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMax>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMax>);
int64_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef __NVCC__
#include <cub/cub.cuh>
#include <limits>
#include <string>
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace { // NOLINT
template <typename K, typename V>
using KeyValuePair = cub::KeyValuePair<K, V>;
using Tensor = framework::Tensor;
} // end namespace
#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \
case (1 << (log2_block_dim)): { \
constexpr auto kBlockDim = (1 << (log2_block_dim)); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM_CASE(...) \
FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
template <typename T, typename IndType, class Reducer, size_t BlockDim>
__global__ void ArgCUDAKernel(const int64_t height, // n * h
const int64_t width, // c
const int64_t post_size, // h
const Reducer reducer, const T init, const T* in,
IndType* out) {
typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
KeyValuePair<int, T> kv_pair = {-1, init};
int h = idx / post_size;
int w = idx % post_size;
for (int k = threadIdx.x; k < width; k += blockDim.x) {
kv_pair =
reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
}
kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
if (threadIdx.x == 0) {
out[idx] = static_cast<IndType>(kv_pair.key);
}
__syncthreads();
}
}
template <typename T, typename IndType, class Reducer>
void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
Tensor* indices, const int64_t pre, const int64_t post,
const int64_t n) {
auto cu_stream = ctx.stream();
auto ComputeBlockSize = [](int64_t col) {
if (col > 512)
return 1024;
else if (col > 256)
return 512;
else if (col > 128)
return 256;
else if (col > 64)
return 128;
else if (col > 32)
return 64;
else if (col > 16)
return 32;
else if (col > 8)
return 16;
else
return 8;
};
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
const T* in_data = input.data<T>();
IndType* out_data = indices->mutable_data<IndType>(ctx.GetPlace());
if (typeid(Reducer) == typeid(cub::ArgMax)) {
switch (ComputeBlockSize(width)) {
FIXED_BLOCK_DIM_CASE(
ArgCUDAKernel<T, IndType, Reducer,
kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
height, width, post, Reducer(), std::numeric_limits<T>::lowest(),
in_data, out_data));
}
} else {
switch (ComputeBlockSize(width)) {
FIXED_BLOCK_DIM_CASE(
ArgCUDAKernel<T, IndType, Reducer,
kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
height, width, post, Reducer(), std::numeric_limits<T>::max(),
in_data, out_data));
}
}
}
template <typename T, class Reducer>
struct VisitDataCudaArgMinMaxFunctor {
const framework::ExecutionContext& ctx;
explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx)
: ctx(ctx) {}
template <typename IndType>
void apply() const {
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
int axis = ctx.Attr<int64_t>("axis");
const bool& flatten = ctx.Attr<bool>("flatten");
framework::DDim input_dims;
if (flatten) {
input_dims = framework::make_ddim({input->numel()});
// if flatten, the axis is just 0
axis = 0;
} else {
input_dims = input->dims();
if (axis < 0) axis += input->dims().size();
}
int64_t numel = input->numel();
int64_t groups = numel / input_dims[axis];
int64_t pre = 1;
int64_t post = 1;
int64_t n = input_dims[axis];
for (int i = 0; i < axis; i++) {
pre *= input_dims[i];
}
for (int i = axis + 1; i < input_dims.size(); i++) {
post *= input_dims[i];
}
const auto& dev_ctx = ctx.cuda_device_context();
ComputeFullArg<T, IndType, Reducer>(dev_ctx, *input, output, pre, post, n);
}
};
template <typename T, class Reducer>
class ArgMinMaxOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dtype = ctx.Attr<int>("dtype");
if (dtype < 0) {
framework::VisitDataType(static_cast<framework::proto::VarType::Type>(
framework::proto::VarType::INT64),
VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
return;
}
framework::VisitDataType(
static_cast<framework::proto::VarType::Type>(dtype),
VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
}
};
#endif
} // namespace operators
} // namespace paddle
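`ComputeFullArg` above flattens the input into a (pre, n, post) view and reduces over the middle dimension; the kernel's `in[h * width * post + k * post + w]` indexing is row-major indexing of exactly that view. A NumPy sketch of the same decomposition (illustrative only; `argmax_pre_post` is a hypothetical name):

```python
import numpy as np

def argmax_pre_post(x, axis):
    """Mirror the (pre, n, post) flattening used by ComputeFullArg above."""
    dims = x.shape
    pre = int(np.prod(dims[:axis], dtype=np.int64))
    n = dims[axis]
    post = int(np.prod(dims[axis + 1:], dtype=np.int64))
    flat = x.reshape(pre, n, post)
    out = np.empty((pre, post), dtype=np.int64)
    for h in range(pre):        # each grid index in the kernel is one (h, w) pair;
        for w in range(post):   # the block's threads cooperate on the reduction over k
            out[h, w] = np.argmax(flat[h, :, w])
    return out.reshape(dims[:axis] + dims[axis + 1:])

x = np.random.rand(2, 3, 4)
print(np.array_equal(argmax_pre_post(x, 1), np.argmax(x, axis=1)))   # True
```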
...@@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {}; ...@@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {};
struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank, \ struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank, \
enum_argminmax_value> { \ enum_argminmax_value> { \
void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
framework::LoDTensor* out, int64_t axis, bool keepdims) { \ framework::LoDTensor* out, framework::DDim x_dims, \
auto in_eigen = framework::EigenTensor<T, Rank>::From(in); \ int64_t axis, bool keepdims) { \
auto in_eigen = framework::EigenTensor<T, Rank>::From(in, x_dims); \
if (keepdims) { \ if (keepdims) { \
auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out); \ auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out); \
out_eigen.device(*(ctx.eigen_device())) = \ out_eigen.device(*(ctx.eigen_device())) = \
...@@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor { ...@@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor {
out.template mutable_data<Tout>(ctx.GetPlace()); out.template mutable_data<Tout>(ctx.GetPlace());
auto axis = ctx.Attr<int64_t>("axis"); auto axis = ctx.Attr<int64_t>("axis");
auto keepdims = ctx.Attr<bool>("keepdims"); auto keepdims = ctx.Attr<bool>("keepdims");
auto x_rank = x.dims().size(); const bool& flatten = ctx.Attr<bool>("flatten");
if (axis < 0) axis += x_rank;
// if flatten, construct the new dims for the calculation
framework::DDim x_dims;
if (flatten) {
x_dims = framework::make_ddim({x.numel()});
// if flatten, the axis is just 0
axis = 0;
} else {
x_dims = x.dims();
if (axis < 0) axis += x_dims.size();
}
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
#define CALL_ARG_MINMAX_FUNCTOR(rank) \ #define CALL_ARG_MINMAX_FUNCTOR(rank) \
ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \ ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
functor##rank; \ functor##rank; \
functor##rank(dev_ctx, x, &out, axis, keepdims) functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims)
switch (x.dims().size()) { switch (x_dims.size()) {
case 1: case 1:
CALL_ARG_MINMAX_FUNCTOR(1); CALL_ARG_MINMAX_FUNCTOR(1);
break; break;
...@@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { ...@@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
const auto& x_dims = ctx->GetInputDim("X"); const auto& x_dims = ctx->GetInputDim("X");
int64_t axis = ctx->Attrs().Get<int64_t>("axis"); int64_t axis = ctx->Attrs().Get<int64_t>("axis");
bool keepdims = ctx->Attrs().Get<bool>("keepdims"); bool keepdims = ctx->Attrs().Get<bool>("keepdims");
const bool& flatten = ctx->Attrs().Get<bool>("flatten");
PADDLE_ENFORCE_GE(axis, -x_dims.size(), PADDLE_ENFORCE_GE(axis, -x_dims.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { ...@@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size())); "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
std::vector<int64_t> vec;
if (flatten) {
// if flatten, only one element is returned
if (keepdims) {
vec.emplace_back(static_cast<int64_t>(1));
}
} else {
auto x_rank = x_dims.size(); auto x_rank = x_dims.size();
if (axis < 0) axis += x_rank; if (axis < 0) axis += x_rank;
std::vector<int64_t> vec; for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
if (keepdims) { if (keepdims) {
vec.push_back(static_cast<int64_t>(1)); vec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]);
} }
for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
ctx->SetOutputDim("Out", framework::make_ddim(vec)); ctx->SetOutputDim("Out", framework::make_ddim(vec));
} }
}; };
...@@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int64_t>("axis", "The axis in which to compute the arg indics."); AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false); AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1); AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
AddAttr<bool>("flatten",
"Flatten the input value, and search the min or max indices")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC( AddComment(string::Sprintf(R"DOC(
%s Operator. %s Operator.
......
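The `flatten` attribute added above changes only the shape logic: the search runs over all elements with the axis fixed to 0, and the output is a scalar, or `[1]` when `keepdims` is set. A minimal NumPy sketch of that behaviour (not the Paddle API):

```python
import numpy as np

def argmax_with_flatten(x, axis=0, keepdims=False, flatten=False):
    """Output-shape behaviour of the flatten attribute added above."""
    if flatten:
        idx = np.argmax(x.reshape(-1))   # treat the input as 1-D, axis becomes 0
        return np.array([idx]) if keepdims else np.array(idx)
    out = np.argmax(x, axis=axis)
    return np.expand_dims(out, axis) if keepdims else out

x = np.arange(6).reshape(2, 3)
print(argmax_with_flatten(x, flatten=True))                  # 5, shape ()
print(argmax_with_flatten(x, flatten=True, keepdims=True))   # [5], shape (1,)
print(argmax_with_flatten(x, axis=1).shape)                  # (2,)
```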
...@@ -12,18 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
arg_min, arg_min, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMin>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMin>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMin>,
double>, paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMin>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMin>);
int64_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
...@@ -32,22 +32,29 @@ class BCELossOp : public framework::OperatorWithKernel { ...@@ -32,22 +32,29 @@ class BCELossOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label"); auto labels_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(
x_dims.size(), label_dims.size(), int rank = x_dims.size();
PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Input(X) and Input(Label) shall have the same shape.")); "Input(X) and Input(Label) shall have the same rank."
bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) || "But received: the rank of Input(X) is [%d], "
framework::contain_unknown_dim(label_dims); "the rank of Input(Label) is [%d].",
bool check = ctx->IsRuntime() || !contain_unknown_dim; rank, labels_dims.size()));
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
framework::product(labels_dims) <= 0)) {
check = false;
}
if (check) { if (check) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(x_dims, labels_dims,
x_dims.size(), label_dims.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"ShapeError: Input(X) and Input(Label) shall have the same shape " "Input(X) and Input(Label) shall have the same "
"But received: the shape of Input(X) is [%s], the shape of " "shape. But received: the shape of Input(X) is "
"Input(Label) is [%s].", "[%s], the shape of Input(Label) is [%s].",
x_dims, label_dims)); x_dims, labels_dims));
} }
ctx->ShareDim("X", "Out"); ctx->ShareDim("X", "Out");
...@@ -76,20 +83,31 @@ class BCELossGradOp : public framework::OperatorWithKernel { ...@@ -76,20 +83,31 @@ class BCELossGradOp : public framework::OperatorWithKernel {
framework::GradVarName("X"), "BCELossGrad"); framework::GradVarName("X"), "BCELossGrad");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Label");
auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
framework::contain_unknown_dim(dout_dims); bool check = true;
bool check = ctx->IsRuntime() || !contain_unknown_dim; if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
framework::product(labels_dims) <= 0)) {
check = false;
}
if (check) { if (check) {
PADDLE_ENFORCE_EQ(x_dims, labels_dims,
platform::errors::InvalidArgument(
"Input(X) and Input(Label) shall have the same "
"shape. But received: the shape of Input(X) is "
"[%s], the shape of Input(Label) is [%s].",
x_dims, labels_dims));
PADDLE_ENFORCE_EQ(x_dims, dout_dims, PADDLE_ENFORCE_EQ(x_dims, dout_dims,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"ShapeError:The Input(X) and Input(Out@Grad) " "Input(X) and Input(Out@Grad) shall have the same "
"should have the same " "shape. But received: the shape of Input(X) is "
"shape, But received: the shape of Input(X) is " "[%s], the shape of Input(Out@Grad) is [%s].",
"[%s], the shape of "
"Input(Out@GRAD) is [%s].",
x_dims, dout_dims)); x_dims, dout_dims));
} }
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X")); ctx->ShareLoD("X", framework::GradVarName("X"));
} }
......
...@@ -67,7 +67,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> { ...@@ -67,7 +67,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
auto x_data = x->data<T>(); auto x_data = x->data<T>();
auto out_data = out->mutable_data<T>(ctx.GetPlace()); auto out_data = out->mutable_data<T>(ctx.GetPlace());
int x_numel = x->numel(); auto x_numel = x->numel();
platform::GpuLaunchConfig config = platform::GpuLaunchConfig config =
platform::getGpuLaunchConfig(x_numel, ctx); platform::getGpuLaunchConfig(x_numel, ctx);
...@@ -75,7 +76,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> { ...@@ -75,7 +76,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu);
T* x_cpu_data = x_cpu.data<T>(); T* x_cpu_data = x_cpu.data<T>();
for (int i = 0; i < x_numel; ++i) { for (int64_t i = 0; i < x_numel; ++i) {
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
x_cpu_data[i], static_cast<T>(0), x_cpu_data[i], static_cast<T>(0),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
......
...@@ -34,11 +34,11 @@ class BCELossOpKernel : public framework::OpKernel<T> { ...@@ -34,11 +34,11 @@ class BCELossOpKernel : public framework::OpKernel<T> {
auto x_data = x->data<T>(); auto x_data = x->data<T>();
auto label_data = labels->data<T>(); auto label_data = labels->data<T>();
auto out_data = out->mutable_data<T>(ctx.GetPlace()); auto out_data = out->mutable_data<T>(ctx.GetPlace());
int x_numel = x->numel(); auto x_numel = x->numel();
// out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 -
// x) - label * ln(x) // x) - label * ln(x)
for (int i = 0; i < x_numel; ++i) { for (int64_t i = 0; i < x_numel; ++i) {
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
x_data[i], static_cast<T>(0), x_data[i], static_cast<T>(0),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
......
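The comment in the kernel above gives the elementwise loss as -(label * ln(x) + (1 - label) * ln(1 - x)), with the input required to lie in [0, 1] (the PADDLE_ENFORCE_GE/LE checks). A small NumPy reference, offered as a sketch; `bce_loss_ref` is a hypothetical helper:

```python
import numpy as np

def bce_loss_ref(x, label):
    """Elementwise -(label*ln(x) + (1-label)*ln(1-x)); x must lie in [0, 1]."""
    if np.any((x < 0) | (x > 1)):
        raise ValueError("the input of bce_loss should be in [0, 1]")
    return (label - 1.0) * np.log(1.0 - x) - label * np.log(x)

x = np.array([0.2, 0.8])
label = np.array([0.0, 1.0])
print(bce_loss_ref(x, label))   # [-ln(0.8), -ln(0.8)] ~= [0.2231, 0.2231]
```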
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bernoulli_op.h"
#include <algorithm>
#include <string>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
namespace paddle {
namespace operators {
class BernoulliOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"A tensor with probabilities for generating the random binary "
"number");
AddOutput("Out", "A Tensor filled with random binary number");
AddComment(R"DOC(
This OP returns a Tensor filled with random binary (0 or 1) numbers sampled from a Bernoulli distribution.
Out ~ Bernoulli(X)
)DOC");
}
};
class BernoulliOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
return UnaryOpUnchangedInferShape(ctx);
}
};
// It seems that Eigen::Tensor::random will SEGFAULT on GPU.
// Use std::random on CPU and thrust::random (Thrust is the parallel algorithms
// library shipped with CUDA) to implement the uniform sampling.
template <typename T>
class BernoulliOpKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto x = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto *in_data = x->data<T>();
auto *out_data = out->mutable_data<T>(ctx.GetPlace());
int64_t size = x->numel();
std::uniform_real_distribution<T> dist(0.0, 1.0);
auto gen_ptr = framework::Generator::GetInstance();
std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
for (int64_t i = 0; i < size; ++i) {
out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(bernoulli,
ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);
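The CPU kernel above draws one uniform number per element and emits 1 when it falls below the probability, which is what `BernoulliFunctor` encodes. A minimal NumPy sketch of that sampling rule (illustrative; the fixed seed and the helper name are assumptions of the example):

```python
import numpy as np

def bernoulli_ref(p, rng=None):
    """out[i] = 1.0 if u[i] < p[i] with u ~ Uniform(0, 1), as in BernoulliFunctor."""
    if rng is None:
        rng = np.random.default_rng(0)
    if np.any((p < 0) | (p > 1)):
        raise ValueError("probabilities must lie in [0, 1]")
    return (rng.uniform(size=p.shape) < p).astype(p.dtype)

p = np.array([0.0, 0.3, 1.0])
print(bernoulli_ref(p))   # the first entry is always 0.0, the last is always 1.0
```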
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/bernoulli_op.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
// It can be made consistent with the CPU kernel once a CUDA Generator is provided.
template <typename T>
struct BernoulliCudaFunctor {
unsigned int seed_;
__host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {}
__host__ __device__ T operator()(const unsigned int n, const T p) const {
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(0.0, 1.0);
rng.discard(n);
return static_cast<T>(dist(rng) < p);
}
};
template <typename T>
class BernoulliOpKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
std::random_device rd;
auto seed = rd();
const auto x = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto* in_data = x->data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
int64_t size = x->numel();
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
platform::Transform<platform::CUDADeviceContext> trans;
auto* context =
static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
out_data, BernoulliCudaFunctor<T>(seed));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
bernoulli, ops::BernoulliOpKernel<plat::CUDADeviceContext, float>,
ops::BernoulliOpKernel<plat::CUDADeviceContext, double>);
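The CUDA kernel above gives every output index its own random stream (seed the engine, then `discard(n)`), so the result does not depend on how threads are scheduled. The sketch below illustrates the same counter-based idea with NumPy generators keyed by (seed, index); it is not the minstd_rand scheme used above, just an analogy, and `bernoulli_counter_based` is a hypothetical name.

```python
import numpy as np

def bernoulli_counter_based(p, seed):
    """Each element i draws from a stream keyed by (seed, i), mimicking rng.discard(n)."""
    flat = p.reshape(-1)
    u = np.array([np.random.default_rng([seed, i]).uniform() for i in range(flat.size)])
    return (u < flat).astype(p.dtype).reshape(p.shape)

p = np.array([0.1, 0.5, 0.9])
print(bernoulli_counter_based(p, seed=123))
print(bernoulli_counter_based(p, seed=123))   # identical: independent of evaluation order
```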
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
/**
* Samples a bernoulli distribution given a probability input
*/
template <typename T>
inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange(
"The probability should be <= 1, but got %f", p));
PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange(
"The probability should be >= 0, but got %f", p));
return static_cast<T>(rand < p);
}
template <typename DeviceContext, typename T>
class BernoulliOpKernel;
} // namespace operators
} // namespace paddle
...@@ -63,7 +63,6 @@ class CholeskyGPUKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,6 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
for_range(matrix_band_part_functor); for_range(matrix_band_part_functor);
} }
// TODO(guosheng): Add callback to check info
auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count);
auto* info_ptr = reinterpret_cast<int*>(info->ptr()); auto* info_ptr = reinterpret_cast<int*>(info->ptr());
...@@ -96,6 +95,20 @@ class CholeskyGPUKernel : public framework::OpKernel<T> { ...@@ -96,6 +95,20 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
#if CUDA_VERSION >= 9020 && !defined(_WIN32) #if CUDA_VERSION >= 9020 && !defined(_WIN32)
} }
#endif #endif
// check the info
std::vector<int> error_info; // only used to check that the input is positive definite
error_info.resize(batch_count);
memory::Copy(platform::CPUPlace(), error_info.data(),
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
info_ptr, sizeof(int) * batch_count, dev_ctx.stream());
for (int i = 0; i < batch_count; ++i) {
PADDLE_ENFORCE_EQ(error_info[i], 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: U(%d, %d) is zero, singular U.", i,
error_info[i], error_info[i]));
}
} }
void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo,
......
...@@ -59,22 +59,24 @@ class CholeskyCPUKernel : public framework::OpKernel<T> { ...@@ -59,22 +59,24 @@ class CholeskyCPUKernel : public framework::OpKernel<T> {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>, Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
Eigen::UpLoType::Upper> Eigen::UpLoType::Upper>
llt_decomposition(input); llt_decomposition(input);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
llt_decomposition.info(), Eigen::Success,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Cholesky decomposition was not successful. The input matrice " "Cholesky decomposition was not successful. The "
"might not be not be positive definite.")); "%d-th input matrice "
"might not be not be positive definite.",
i));
output = llt_decomposition.matrixU(); output = llt_decomposition.matrixU();
} else { } else {
Eigen::LLT< Eigen::LLT<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>, Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
Eigen::UpLoType::Lower> Eigen::UpLoType::Lower>
llt_decomposition(input); llt_decomposition(input);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
llt_decomposition.info(), Eigen::Success,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Cholesky decomposition was not successful. The input matrice " "Cholesky decomposition was not successful. The "
"might not be not be positive definite.")); "%d-th input matrice "
"might not be not be positive definite.",
i));
output = llt_decomposition.matrixL(); output = llt_decomposition.matrixL();
} }
} }
......
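Both the cuSOLVER `info` check added above and the Eigen `info() == Eigen::Success` checks in this hunk fail when the input is not positive definite, and the `upper` attribute selects whether U (A = U^T U) or L (A = L L^T) is returned. A NumPy sketch of the same behaviour; np.linalg.cholesky returns the lower factor and raises for indefinite input:

```python
import numpy as np

a = np.array([[4.0, 2.0],
              [2.0, 3.0]])            # symmetric positive definite
lower = np.linalg.cholesky(a)         # L with a = L @ L.T   (the upper=False branch)
upper = np.linalg.cholesky(a).T       # U with a = U.T @ U   (the upper=True branch)
print(np.allclose(lower @ lower.T, a), np.allclose(upper.T @ upper, a))   # True True

bad = np.array([[1.0, 2.0],
                [2.0, 1.0]])          # indefinite: eigenvalues 3 and -1
try:
    np.linalg.cholesky(bad)
except np.linalg.LinAlgError as e:    # analogous to the info / Eigen::Success checks above
    print("not positive definite:", e)
```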
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceMaxOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Max"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_max, ops::CReduceOp,
ops::CReduceMaxOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_max,
ops::CReduceOpCPUKernel<ops::kRedMax, float>,
ops::CReduceOpCPUKernel<ops::kRedMax, double>,
ops::CReduceOpCPUKernel<ops::kRedMax, int>,
ops::CReduceOpCPUKernel<ops::kRedMax, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedMax, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_max,
ops::CReduceOpCUDAKernel<ops::kRedMax, float>,
ops::CReduceOpCUDAKernel<ops::kRedMax, double>,
ops::CReduceOpCUDAKernel<ops::kRedMax, int>,
ops::CReduceOpCUDAKernel<ops::kRedMax, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceMinOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Min"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_min, ops::CReduceOp,
ops::CReduceMinOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_min,
ops::CReduceOpCPUKernel<ops::kRedMin, float>,
ops::CReduceOpCPUKernel<ops::kRedMin, double>,
ops::CReduceOpCPUKernel<ops::kRedMin, int>,
ops::CReduceOpCPUKernel<ops::kRedMin, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedMin, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_min,
ops::CReduceOpCUDAKernel<ops::kRedMin, float>,
ops::CReduceOpCUDAKernel<ops::kRedMin, double>,
ops::CReduceOpCUDAKernel<ops::kRedMin, int>,
ops::CReduceOpCUDAKernel<ops::kRedMin, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd };
class CReduceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
template <ReduceType red_type, typename T>
class CReduceOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
true, false,
platform::errors::Unavailable("Unimplemented CReduceOpCPUKernel now."));
}
};
template <ReduceType red_type, typename T>
class CReduceOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_NCCL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto place = ctx.GetPlace();
ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
int64_t numel = in->numel();
const void* sendbuff = in->data<void>();
out->Resize(in->dims());
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
int root = ctx.Attr<int>("root_id");
auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
cudaStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
ncclRedOp_t nccl_red_type = ncclSum;
switch (red_type) {
case kRedSum:
nccl_red_type = ncclSum;
break;
case kRedMax:
nccl_red_type = ncclMax;
break;
case kRedMin:
nccl_red_type = ncclMin;
break;
case kRedProd:
nccl_red_type = ncclProd;
break;
default:
PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument(
"red_type must be one of kRedSum, "
"kRedMax, kRedMin, kRedProd."));
}
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(),
stream));
#else
PADDLE_ENFORCE_EQ(true, false,
platform::errors::Unavailable(
"PaddlePaddle should compile with GPU.."));
#endif
}
};
class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor), tensor to be reduced.");
AddOutput("Out", "(Tensor) the reduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
CReduce %s Operator
Call collective Reduce with reduce type %s. If input and output are
the same variable, in-place reduce will be used.
)DOC",
GetName(), GetName()));
}
protected:
virtual std::string GetName() const = 0;
};
} // namespace operators
} // namespace paddle
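The c_reduce_* ops above combine one tensor per rank with sum/max/min/prod and deliver the result to `root_id`; on the other ranks the receive buffer is left unspecified (NCCL reduce semantics). A single-process NumPy sketch of that semantics (`c_reduce_sim` is a hypothetical stand-in, not a distributed implementation):

```python
import numpy as np

def c_reduce_sim(rank_tensors, red_type="sum", root_id=0):
    """Combine one tensor per rank elementwise; only the root keeps the result."""
    ops = {"sum": np.sum, "max": np.max, "min": np.min, "prod": np.prod}
    stacked = np.stack(rank_tensors)             # shape [nranks, ...]
    reduced = ops[red_type](stacked, axis=0)
    # Non-root ranks get a buffer of the same shape whose content is unspecified.
    return [reduced if r == root_id else np.empty_like(reduced)
            for r in range(len(rank_tensors))]

ranks = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])]
print(c_reduce_sim(ranks, "sum", root_id=0)[0])   # [ 9. 12.]
print(c_reduce_sim(ranks, "max", root_id=0)[0])   # [5. 6.]
```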
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceProdOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Prod"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_prod, ops::CReduceOp,
ops::CReduceProdOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_prod,
ops::CReduceOpCPUKernel<ops::kRedProd, float>,
ops::CReduceOpCPUKernel<ops::kRedProd, double>,
ops::CReduceOpCPUKernel<ops::kRedProd, int>,
ops::CReduceOpCPUKernel<ops::kRedProd, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedProd, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_prod,
ops::CReduceOpCUDAKernel<ops::kRedProd, float>,
ops::CReduceOpCUDAKernel<ops::kRedProd, double>,
ops::CReduceOpCUDAKernel<ops::kRedProd, int>,
ops::CReduceOpCUDAKernel<ops::kRedProd, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceSumOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Sum"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_sum, ops::CReduceOp,
ops::CReduceSumOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_sum,
ops::CReduceOpCPUKernel<ops::kRedSum, float>,
ops::CReduceOpCPUKernel<ops::kRedSum, double>,
ops::CReduceOpCPUKernel<ops::kRedSum, int>,
ops::CReduceOpCPUKernel<ops::kRedSum, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedSum, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_sum,
ops::CReduceOpCUDAKernel<ops::kRedSum, float>,
ops::CReduceOpCUDAKernel<ops::kRedSum, double>,
ops::CReduceOpCUDAKernel<ops::kRedSum, int>,
ops::CReduceOpCUDAKernel<ops::kRedSum, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_scatter_op.h"
namespace paddle {
namespace operators {
class CScatterOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CScatter");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CScatter");
int root_id = ctx->Attrs().Get<int>("root");
int ring_id = ctx->Attrs().Get<int>("ring_id");
int nranks = ctx->Attrs().Get<int>("nranks");
PADDLE_ENFORCE_GE(nranks, 2,
platform::errors::InvalidArgument(
"The number of ranks (%d) must be greater than 1 "
"to use collective op (c_scatter op).",
nranks));
PADDLE_ENFORCE_GE(
root_id, 0,
platform::errors::InvalidArgument(
"The root_id (%d) for c_scatter_op must be non-negative.",
root_id));
PADDLE_ENFORCE_GE(
ring_id, 0,
platform::errors::InvalidArgument(
"The ring_id (%d) for c_scatter_op must be non-negative.",
ring_id));
framework::DDim dim = ctx->GetInputDim("X");
dim[0] = dim[0] / nranks;
if (dim[0] < 0) dim[0] = -1;
ctx->SetOutputDim("Out", dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class CScatterOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor) tensor to be broadcasted.");
AddOutput("Out", "(Tensor) the result of broadcast.");
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0);
AddAttr<int>("nranks", "(int default 1) number of ranks.").SetDefault(0);
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
.SetDefault(false);
AddComment(R"DOC(
CScatter Operator
Scatter the source tensor from the root rank to all participants.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_scatter, ops::CScatterOp, ops::CScatterOpMaker);
REGISTER_OP_CPU_KERNEL(c_scatter, ops::CScatterOpCPUKernel<float>,
ops::CScatterOpCPUKernel<double>,
ops::CScatterOpCPUKernel<int>,
ops::CScatterOpCPUKernel<int64_t>,
ops::CScatterOpCPUKernel<plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_scatter_op.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CScatterOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_NCCL)
auto x = ctx.Input<framework::LoDTensor>("X");
auto out = ctx.Output<framework::LoDTensor>("Out");
int numel = x->numel();
ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
int nranks = ctx.Attr<int>("nranks");
int root_id = ctx.Attr<int>("root");
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
PADDLE_ENFORCE_EQ(nranks, comm->nranks(),
platform::errors::InvalidArgument(
"The number of ranks (%d) you set of must "
"be equal to comm->nranks (%d).",
nranks, comm->nranks()));
PADDLE_ENFORCE_GE(
root_id, 0,
platform::errors::InvalidArgument(
"The root_id (%d) for c_scatter_op must be non-negative.",
root_id));
PADDLE_ENFORCE_GE(
ring_id, 0,
platform::errors::InvalidArgument(
"The ring_id (%d) for c_scatter_op must be non-negative.",
ring_id));
cudaStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
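    // The kernel broadcasts the whole input from the root rank with ncclBcast
    // into a temporary buffer, and every rank then copies only its own shard
    // of the first dimension into Out.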
framework::DDim x_dims = x->dims();
framework::DDim out_dims(x_dims);
framework::Tensor temp;
auto out_ptr = temp.mutable_data<T>(out_dims, place);
if (root_id == comm->rank()) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
root_id, comm->comm(), stream));
framework::TensorCopy(*static_cast<const framework::Tensor*>(x), place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<framework::Tensor*>(&temp));
} else {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
out_ptr, numel, dtype, root_id, comm->comm(), stream));
}
out_dims[0] = out_dims[0] / nranks;
auto start_index = out_dims[0] * comm->rank();
auto end_index = start_index + out_dims[0];
temp = temp.Slice(start_index, end_index);
temp.Resize(out_dims);
out->mutable_data<T>(out_dims, place);
framework::TensorCopySync(*static_cast<const framework::Tensor*>(&temp),
place, static_cast<framework::Tensor*>(out));
out->Resize(out_dims);
#else
PADDLE_ENFORCE_EQ(
true, false,
platform::errors::Unavailable("PaddlePaddle should compile with GPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_scatter, ops::CScatterOpCUDAKernel<float>,
ops::CScatterOpCUDAKernel<double>,
ops::CScatterOpCUDAKernel<int>,
ops::CScatterOpCUDAKernel<int64_t>,
ops::CScatterOpCUDAKernel<plat::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T>
class CScatterOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(true, false,
platform::errors::Unavailable(
"Unimplemented cpu kernel for CScatterOp."));
}
};
} // namespace operators
} // namespace paddle
...@@ -196,7 +196,7 @@ framework::OpKernelType ConvOp::GetKernelTypeForVar( ...@@ -196,7 +196,7 @@ framework::OpKernelType ConvOp::GetKernelTypeForVar(
auto ar = paddle::framework::AttrReader(attrs); auto ar = paddle::framework::AttrReader(attrs);
const std::string data_format = ar.Get<std::string>("data_format"); const std::string data_format = ar.Get<std::string>("data_format");
auto dl = framework::StringToDataLayout(data_format); auto dl = framework::StringToDataLayout(data_format);
// Some models may have intentionally set "AnyLayout" for pool // Some models may have intentionally set "AnyLayout" for conv
// op. Treat this as NCHW (default data_format value) // op. Treat this as NCHW (default data_format value)
if (dl != framework::DataLayout::kAnyLayout) { if (dl != framework::DataLayout::kAnyLayout) {
return framework::OpKernelType(expected_kernel_type.data_type_, return framework::OpKernelType(expected_kernel_type.data_type_,
......
...@@ -37,6 +37,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -37,6 +37,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> output_size = std::vector<int> output_size =
ctx->Attrs().Get<std::vector<int>>("output_size"); ctx->Attrs().Get<std::vector<int>>("output_size");
std::vector<int> output_padding =
ctx->Attrs().Get<std::vector<int>>("output_padding");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations"); std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
...@@ -78,6 +80,12 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -78,6 +80,12 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The Attr(output_size) and Attr(stride) of Op(conv_transpose) " "The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
"should be the same.")); "should be the same."));
if (output_padding.size())
PADDLE_ENFORCE_EQ(
output_padding.size(), strides.size(),
platform::errors::InvalidArgument(
"The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
const int64_t C = const int64_t C =
(data_layout != DataLayout::kNHWC ? in_dims[1] (data_layout != DataLayout::kNHWC ? in_dims[1]
...@@ -136,6 +144,27 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -136,6 +144,27 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
infer_shape + strides[i])); infer_shape + strides[i]));
} }
output_shape.push_back(output_size[i]); output_shape.push_back(output_size[i]);
} else if (output_padding.size()) {
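          // output_padding adds extra size to one side of each spatial
          // dimension of the inferred output shape; each entry must be
          // smaller than max(stride, dilation) of that dimension.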
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
output_padding[i], 0,
platform::errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should not be "
"less than the 0. But received output_padding = "
"[%s], whose dim %d is less than 0",
framework::make_ddim(output_padding), i));
PADDLE_ENFORCE_LT(
output_padding[i], std::max(strides[i], dilations[i]),
platform::errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should be less "
"than either stride or dilation. But received output_size = "
"[%s], "
"whose dim %d is not less than either stride (%d) or "
"dilation (%d)",
framework::make_ddim(output_size), i, strides[i],
dilations[i]));
}
output_shape.push_back((infer_shape + output_padding[i]));
} else { } else {
output_shape.push_back(infer_shape); output_shape.push_back(infer_shape);
} }
...@@ -223,10 +252,14 @@ void Conv2DTransposeOpMaker::Make() { ...@@ -223,10 +252,14 @@ void Conv2DTransposeOpMaker::Make() {
"The format of output tensor is X (one-dimensional) of size equal" "The format of output tensor is X (one-dimensional) of size equal"
"to the number of output channels. Only used with MKL-DNN.") "to the number of output channels. Only used with MKL-DNN.")
.AsDispensable(); .AsDispensable();
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. " "(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is the same as input tensor."); "The format of output tensor is the same as input tensor.");
AddAttr<std::vector<int>>("output_padding",
"(vector<int> default: []), Additional size added "
"to one side of each dimension in the output "
"shape")
.SetDefault({});
AddAttr<std::vector<int>>("output_size", AddAttr<std::vector<int>>("output_size",
"(vector<int> default: []), the " "(vector<int> default: []), the "
"size of the output tensor") "size of the output tensor")
...@@ -338,6 +371,11 @@ void Conv3DTransposeOpMaker::Make() { ...@@ -338,6 +371,11 @@ void Conv3DTransposeOpMaker::Make() {
"Where N is batch size, C is " "Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the " "the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature."); "height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>("output_padding",
"(vector<int> default: []), Additional size added "
"to one side of each dimension in the output "
"shape")
.SetDefault({});
AddAttr<std::vector<int>>("output_size", AddAttr<std::vector<int>>("output_size",
"(vector<int> default: []), the " "(vector<int> default: []), the "
"size of the output tensor") "size of the output tensor")
......
...@@ -24,34 +24,62 @@ class CudnnLSTMOp : public framework::OperatorWithKernel { ...@@ -24,34 +24,62 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"), OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM");
"Input(Input) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasInput("W"), OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM");
"Input(Weight) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasInput("InitH"), OP_INOUT_CHECK(ctx->HasOutput("Reserve"), "Output", "Reserve", "CudnnLSTM");
"Input(init_h) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasOutput("StateOut"), "Output", "StateOut",
PADDLE_ENFORCE(ctx->HasInput("InitC"), "CudnnLSTM");
"Input(init_c) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasInput("Cache"), OP_INOUT_CHECK(ctx->HasOutput("LastH"), "Output", "LastH", "CudnnLSTM");
"Input(Cache) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasOutput("LastC"), "Output", "LastC", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("last_h"),
"Output(last_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("last_c"),
"Output(last_c) of LSTM should not be null.");
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3."); auto init_dims = ctx->GetInputDim("InitH");
PADDLE_ENFORCE_EQ(in_dims.size(), 3,
platform::errors::InvalidArgument(
"The rank of Input in CudnnLSTM must be 3. But "
"received Input's rank is %d.",
in_dims.size()));
PADDLE_ENFORCE_EQ(init_dims.size(), 3,
platform::errors::InvalidArgument(
"The rank of InitH in CudnnLSTM must be 3. But "
"received InitH's rank is %d.",
init_dims.size()));
PADDLE_ENFORCE_EQ(in_dims[1], init_dims[1],
platform::errors::InvalidArgument(
"The in_dims[1] (Input dims) and init_dims[1] (InitH "
"dims) should be equal. But "
"received in_dims[1] is %d and init_dims[1] is %d.",
in_dims[1], init_dims[1]));
PADDLE_ENFORCE_EQ(in_dims[2], init_dims[2],
platform::errors::InvalidArgument(
"The in_dims[2] (Input dims) and init_dims[2] (InitH "
"dims) should be equal. But "
"received in_dims[2] is %d and init_dims[2] is %d.",
in_dims[2], init_dims[2]));
auto out_dims = in_dims; auto out_dims = in_dims;
auto hidden_size = ctx->Attrs().Get<int>("hidden_size"); auto hidden_size = ctx->Attrs().Get<int>("hidden_size");
out_dims[2] = hidden_size; bool is_bidirec = ctx->Attrs().Get<bool>("is_bidirec");
out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size;
auto last_dims = init_dims;
last_dims[0] = is_bidirec ? last_dims[0] * 2 : last_dims[0];
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH")); ctx->SetOutputDim("LastH", last_dims);
ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC")); ctx->SetOutputDim("LastC", last_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
ctx.device_context());
} }
}; };
...@@ -84,33 +112,31 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -84,33 +112,31 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) the learnable hidden-hidden weights." "(Tensor) the learnable hidden-hidden weights."
" The shape is (N), where N is total weight size of the LSTM. " " The shape is (N), where N is total weight size of the LSTM. "
" cudnn concatenate all the weight to one Tensor"); " cudnn concatenate all the weight to one Tensor");
AddInput("Cache", AddOutput("Reserve",
"The cache of dropout op, a RAW type variable including random " "(Tensor, a temporary output Tensor to store the reserve_data "
"number generator states and some descriptors, which is used in " "of cudnn kernel.")
"cudnn kernel.") .AsIntermediate();
.AsDispensable(); AddOutput("StateOut",
"Share memory with State. "
"Store the global drop state when training");
AddOutput("Out", AddOutput("Out",
"(Tensor) the hidden state of LSTM operator. " "(Tensor) the hidden state of LSTM operator. "
"The shape is ( seq_len x batch_size x hidden_size) if " "The shape is ( seq_len x batch_size x hidden_size) if "
"is_bidirec is False" "is_bidirec is False"
"and When is_bidirec is True, the shape will be ( seq_len x " "and When is_bidirec is True, the shape will be ( seq_len x "
"batch_size x hidden_size * 2) "); "batch_size x hidden_size * 2) ");
AddOutput("last_h", AddOutput("LastH",
"(Tensor) the hidden state of the last step. " "(Tensor) the hidden state of the last step. "
"The shape is ( num_layers x batch_size x hidden_size) if " "The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False" "is_bidirec is False"
"and When is_bidirec is True, the shape will be (num_layers*2 x " "and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"); "batch_size x hidden_size)");
AddOutput("last_c", AddOutput("LastC",
"(Tensor) the cell state of the last step" "(Tensor) the cell state of the last step"
"The shape is ( num_layers x batch_size x hidden_size) if " "The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False" "is_bidirec is False"
"and When is_bidirect is True, the shape will be (num_layers*2 x " "and When is_bidirect is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size*2)"); "batch_size x hidden_size*2)");
AddAttr<int>("max_len",
"max length of the LSTM op"
"the first dim of the Input can NOT be greater than max_len")
.SetDefault(20);
AddAttr<float>( AddAttr<float>(
"dropout_prob", "dropout_prob",
"dropout prob of the dropout op" "dropout prob of the dropout op"
...@@ -120,14 +146,14 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -120,14 +146,14 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("is_bidirec", AddAttr<bool>("is_bidirec",
"is_bidirec" "is_bidirec"
"if it is bidirectional rnn" "if it is bidirectional rnn"
"The will affect the shape of the Out, last_h, and last_c") "The will affect the shape of the Out, LastH, and LastC")
.SetDefault(false); .SetDefault(false);
AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10); AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100); AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
AddAttr<int>("num_layers", "the total layer number of the LSTM") AddAttr<int>("num_layers", "the total layer number of the LSTM")
.SetDefault(1); .SetDefault(1);
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1); AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
CUDNN LSTM implementation CUDNN LSTM implementation
...@@ -172,16 +198,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { ...@@ -172,16 +198,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"), OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad");
"Input(Input) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad");
PADDLE_ENFORCE(ctx->HasInput("Cache"), OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad");
"Input(last_c) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitH"),
"Input(init_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitC"),
"Input(init_c) of LSTM should not be null.");
auto SetOutGradDim = [&ctx](const std::string& name) { auto SetOutGradDim = [&ctx](const std::string& name) {
auto g_name = framework::GradVarName(name); auto g_name = framework::GradVarName(name);
...@@ -195,6 +215,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { ...@@ -195,6 +215,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
SetOutGradDim("InitH"); SetOutGradDim("InitH");
SetOutGradDim("InitC"); SetOutGradDim("InitC");
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context());
}
}; };
template <typename T> template <typename T>
...@@ -209,13 +235,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -209,13 +235,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetInput("InitH", this->Input("InitH")); op->SetInput("InitH", this->Input("InitH"));
op->SetInput("InitC", this->Input("InitC")); op->SetInput("InitC", this->Input("InitC"));
op->SetInput("W", this->Input("W")); op->SetInput("W", this->Input("W"));
if (this->HasInput("Cache")) { op->SetInput("Reserve", this->Output("Reserve"));
op->SetInput("Cache", this->Input("Cache")); op->SetInput("StateOut", this->Output("StateOut"));
}
op->SetInput("Out", this->Output("Out")); op->SetInput("Out", this->Output("Out"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetInput(framework::GradVarName("last_c"), this->OutputGrad("last_c")); op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC"));
op->SetInput(framework::GradVarName("last_h"), this->OutputGrad("last_h")); op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH"));
op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_desc.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,8 +34,10 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> { ...@@ -33,8 +34,10 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
auto w = ctx.Input<Tensor>("W"); auto w = ctx.Input<Tensor>("W");
Tensor *out = ctx.Output<Tensor>("Out"); Tensor *out = ctx.Output<Tensor>("Out");
Tensor *last_h = ctx.Output<Tensor>("last_h"); Tensor *last_h = ctx.Output<Tensor>("LastH");
Tensor *last_c = ctx.Output<Tensor>("last_c"); Tensor *last_c = ctx.Output<Tensor>("LastC");
Tensor *reserve = ctx.Output<Tensor>("Reserve");
Tensor *state_out = ctx.Output<Tensor>("StateOut");
const T *x_data = x->data<T>(); const T *x_data = x->data<T>();
const T *init_h_data = init_h->data<T>(); const T *init_h_data = init_h->data<T>();
...@@ -46,72 +49,56 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> { ...@@ -46,72 +49,56 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace()); T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace()); T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
size_t max_len = ctx.Attr<int>("max_len");
float dropout_prob = ctx.Attr<float>("dropout_prob"); float dropout_prob = ctx.Attr<float>("dropout_prob");
bool is_bidirec = ctx.Attr<bool>("is_bidirec"); bool is_bidirec = ctx.Attr<bool>("is_bidirec");
int input_size = ctx.Attr<int>("input_size");
int hidden_size = ctx.Attr<int>("hidden_size"); int hidden_size = ctx.Attr<int>("hidden_size");
int num_layers = ctx.Attr<int>("num_layers"); int num_layers = ctx.Attr<int>("num_layers");
bool is_test = ctx.Attr<bool>("is_test"); bool is_test = ctx.Attr<bool>("is_test");
int seed = ctx.Attr<int>("seed");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto *cache_var = ctx.InputVar("Cache");
if (!cache_var) { CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
// The RAW type cache variable wouldn't be created and broadcasted on
// multi-devices before the first running.
// use parent scope to make cache persistable
auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
auto cache_var_name = ctx.InputNames("Cache")[0];
cache_var = scope->Var(cache_var_name);
}
CudnnRNNCache *cudnn_rnn_cache = nullptr;
if (cache_var->IsInitialized()) {
// const_cast is usually bad.
cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
} else {
// const_cast is usually bad.
cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
std::random_device rnd;
int seed = ctx.Attr<int>("seed");
if (seed == -1) {
seed = rnd();
}
auto input_w_numel = w->numel(); auto input_w_numel = w->numel();
auto seq_len = x->dims()[0];
auto batch_size = x->dims()[1]; auto batch_size = x->dims()[1];
cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, auto input_dim = x->dims()[2];
input_size, hidden_size, num_layers, dropout_prob, size_t reserve_size;
is_bidirec, seed, input_w_numel); bool state_initialized = state_out->IsInitialized() ? true : false;
} cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
framework::ToDataType(std::type_index(typeid(T))));
auto run_seq_len = x->dims()[0]; cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
input_dim, hidden_size, num_layers, dropout_prob,
is_bidirec, seed, input_w_numel, &reserve_size,
state_out, state_initialized, cudnn_type);
auto *reserve_data = reserve->mutable_data<uint8_t>(
{static_cast<int64_t>(reserve_size)}, ctx.GetPlace());
if (is_test) { if (is_test) {
// for inference // for inference
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(), cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_)); cudnn_rnn_cache->workspace_size_));
} else { } else {
// for train // for train
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(), cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->workspace_size_, reserve_data, reserve_size));
cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
cudnn_rnn_cache->reserve_size_));
} }
delete cudnn_rnn_cache;
} }
}; };
...@@ -123,15 +110,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -123,15 +110,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
auto *weight = ctx.Input<Tensor>("W"); auto *weight = ctx.Input<Tensor>("W");
auto *init_h = ctx.Input<Tensor>("InitH"); auto *init_h = ctx.Input<Tensor>("InitH");
auto *init_c = ctx.Input<Tensor>("InitC"); auto *init_c = ctx.Input<Tensor>("InitC");
// auto * last_h = ctx.Input<Tensor>("last_h"); auto *reserve = ctx.Input<Tensor>("Reserve");
// auto * last_c = ctx.Input<Tensor>("last_c"); auto *state_out = ctx.Input<Tensor>("StateOut");
auto *out = ctx.Input<Tensor>("Out"); auto *out = ctx.Input<Tensor>("Out");
auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out")); auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h")); auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("LastH"));
auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c")); auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("LastC"));
// auto* init_h = ctx.Input<Tensor>("init_h");
// auto* init_c = ctx.Input<Tensor>("init_c");
auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input")); auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W")); auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
...@@ -140,116 +125,75 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -140,116 +125,75 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto *cache_var = ctx.InputVar("Cache");
PADDLE_ENFORCE(cache_var->IsInitialized());
CudnnRNNCache *cudnn_rnn_cache =
const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
auto input_dims = input->dims(); auto input_dims = input->dims();
auto init_h_dims = init_h->dims(); auto init_h_dims = init_h->dims();
auto init_c_dims = init_c->dims(); auto init_c_dims = init_c->dims();
in_grad->mutable_data<T>(ctx.GetPlace());
weight_grad->mutable_data<T>(ctx.GetPlace()); auto *weight_data = weight->data<T>();
auto *init_h_data = init_h->data<T>();
auto *init_c_data = init_c->data<T>();
auto *out_data = out->data<T>();
auto *out_grad_data = out_grad->data<T>();
auto *last_h_grad_data = last_h_grad->data<T>();
auto *last_c_grad_data = last_c_grad->data<T>();
math::SetConstant<paddle::platform::CUDADeviceContext, T> zero; math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
zero(dev_ctx, in_grad, static_cast<T>(0.0)); weight_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, weight_grad, static_cast<T>(0.0)); zero(dev_ctx, weight_grad, static_cast<T>(0.0));
T *init_h_grad_data = NULL; in_grad->mutable_data<T>(input_dims, ctx.GetPlace());
if (init_h_grad == nullptr) { auto *in_grad_data = in_grad->data<T>();
Tensor init_h_grad_temp;
init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
init_h_grad_data = init_h_grad_temp.data<T>();
} else {
init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace()); init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, init_h_grad, static_cast<T>(0.0)); auto *init_h_grad_data = init_h_grad->data<T>();
init_h_grad_data = init_h_grad->data<T>();
}
T *init_c_grad_data = NULL;
if (init_c_grad == nullptr) {
Tensor init_c_grad_temp;
init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
init_c_grad_data = init_c_grad_temp.data<T>();
} else {
init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace()); init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, init_c_grad, static_cast<T>(0.0)); auto *init_c_grad_data = init_c_grad->data<T>();
init_c_grad_data = init_c_grad->data<T>();
}
const T *last_h_grad_data = NULL;
if (last_h_grad == nullptr) {
Tensor last_h_grad_temp;
last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
} else {
last_h_grad_data = last_h_grad->data<T>();
}
const T *last_c_grad_data = NULL;
if (last_c_grad == nullptr) {
Tensor last_c_grad_temp;
last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
} else {
last_c_grad_data = last_c_grad->data<T>();
}
const T *out_grad_data = NULL;
if (out_grad == nullptr) {
Tensor out_grad_temp;
out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
out_grad_data = (const T *)out_grad_temp.data<T>(); float dropout_prob = ctx.Attr<float>("dropout_prob");
} else { bool is_bidirec = ctx.Attr<bool>("is_bidirec");
out_grad_data = out_grad->data<T>(); int hidden_size = ctx.Attr<int>("hidden_size");
} int num_layers = ctx.Attr<int>("num_layers");
int seed = ctx.Attr<int>("seed");
// zero( dev_ctx, last_h_grad, static_cast<T>(0.0)); CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
// zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
auto out_data = out->data<T>(); auto input_w_numel = weight->numel();
// auto out_grad_data = out_grad->data<T>(); auto seq_len = input_dims[0];
auto weight_data = weight->data<T>(); auto batch_size = input->dims()[1];
auto init_h_data = init_h->data<T>(); auto input_dim = input->dims()[2];
auto init_c_data = init_c->data<T>(); size_t reserve_size;
auto in_grad_data = in_grad->data<T>(); cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
framework::ToDataType(std::type_index(typeid(T))));
cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
input_dim, hidden_size, num_layers, dropout_prob,
is_bidirec, seed, input_w_numel, &reserve_size,
const_cast<Tensor *>(state_out), true, cudnn_type);
auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>(); auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>(); const uint8_t *reserve_data = reserve->data<uint8_t>();
auto run_seq_len = input_dims[0];
PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
"cudnn running seq_len CAN not greater max_lengh");
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->y_desc_,
cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_, out_data, cudnn_rnn_cache->y_desc_, out_grad_data,
out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data, cudnn_rnn_cache->hy_desc_, last_h_grad_data, cudnn_rnn_cache->cy_desc_,
cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, weight_data,
weight_data, cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->cx_desc_,
cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_, init_c_data, cudnn_rnn_cache->x_desc_, in_grad_data,
in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data, cudnn_rnn_cache->hx_desc_, init_h_grad_data, cudnn_rnn_cache->cx_desc_,
cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data, init_c_grad_data, work_data, cudnn_rnn_cache->workspace_size_,
cudnn_rnn_cache->workspace_size_, reserve_data, const_cast<uint8_t *>(reserve_data), reserve_size));
cudnn_rnn_cache->reserve_size_));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_, init_h->data<T>(),
init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
cudnn_rnn_cache->workspace_data_.data<uint8_t>(), cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_, cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->w_desc_,
weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(), weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
cudnn_rnn_cache->reserve_size_)); reserve_size));
delete cudnn_rnn_cache;
} }
}; };
...@@ -257,5 +201,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -257,5 +201,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>); REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>,
REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>); ops::CudnnLSTMGPUKernel<double>);
REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>,
ops::CudnnLSTMGPUGradKernel<double>);
...@@ -68,9 +68,20 @@ class CVMOpKernel : public framework::OpKernel<T> { ...@@ -68,9 +68,20 @@ class CVMOpKernel : public framework::OpKernel<T> {
    // for Input X that does not have LoD information. // for Input X that does not have LoD information.
if (x->NumLevels() == 0) { if (x->NumLevels() == 0) {
if (use_cvm) {
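      // use_cvm without LoD: keep the first two entries in log space,
      // y[0] = log(x[0] + 1) and y[1] = log(x[1] + 1) - y[0]; the remaining
      // entries are copied through unchanged.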
for (int i = 0; i < batch_size; i++) {
int cursor = i * item_size;
y_data[cursor] = log(x_data[cursor] + 1);
y_data[cursor + 1] = log(x_data[cursor + 1] + 1) - y_data[cursor];
for (int j = 2; j < item_size; j++) {
y_data[cursor + j] = x_data[cursor + j];
}
}
} else {
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
} }
}
} else { } else {
auto lod = x->lod()[0]; auto lod = x->lod()[0];
for (size_t i = 0; i < lod.size() - 1; ++i) { for (size_t i = 0; i < lod.size() - 1; ++i) {
......
...@@ -222,6 +222,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -222,6 +222,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
} else { } else {
auto lod = dist_mat->lod().back(); auto lod = dist_mat->lod().back();
for (size_t i = 0; i < lod.size() - 1; ++i) { for (size_t i = 0; i < lod.size() - 1; ++i) {
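          // Only run the matching on non-empty LoD segments; empty segments
          // are skipped.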
if (lod[i + 1] > lod[i]) {
Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
BipartiteMatch(one_ins, indices + i * col, dist + i * col); BipartiteMatch(one_ins, indices + i * col, dist + i * col);
if (type == "per_prediction") { if (type == "per_prediction") {
...@@ -230,6 +231,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -230,6 +231,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
} }
} }
} }
}
}; };
class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
......
(Two file diffs are collapsed here and not shown.)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using DDim = framework::DDim;
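// ComputeStride returns the distance, in elements, between two consecutive
// indices along `axis` for a row-major (C-contiguous) tensor of shape `dims`,
// i.e. the product of all dimensions after `axis`.
// For example, with dims = {2, 3, 4}: stride(0) = 12, stride(1) = 4,
// stride(2) = 1.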
static inline int ComputeStride(int axis, DDim dims) {
int size = 1;
for (int i = axis + 1; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
} // namespace operators
} // namespace paddle
...@@ -61,7 +61,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) ...@@ -61,7 +61,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator)
cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) cc_test(communicator_test SRCS communicator_test.cc DEPS communicator)
if(WITH_GPU) if(WITH_GPU)
cc_test(collective_server_test SRCS collective_server_test.cc cc_test(collective_server_test SRCS collective_server_test.cc
......
...@@ -44,7 +44,7 @@ class RecvSaveOp : public framework::OperatorWithKernel { ...@@ -44,7 +44,7 @@ class RecvSaveOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")), framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace()); platform::CPUPlace());
} }
}; };
......
(The remaining file diffs in this commit are collapsed and not shown.)