diff --git a/CMakeLists.txt b/CMakeLists.txt
index f30671bd3a87e87732b3a047e91811452370e06e..28dc39920c6d0748588168820f5043b2360e3ac9 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,13 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
 include(system)
 
+# Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING
+        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+        FORCE)
+endif()
+
 project(paddle CXX C)
 
 # enable language CUDA
@@ -213,12 +220,6 @@ if(NOT PY_VERSION)
 endif()
 set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 
-# CMAKE_BUILD_TYPE
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release" CACHE STRING
-        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-        FORCE)
-endif()
 
 # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. Default: OFF
 if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
diff --git a/README.md b/README.md
index 8b437e4115abe80073866f52f3d7e387e2a554d3..d0a35332d474e32691b595df8c4c2d0e780bd344 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide
 ## Communication
 
 - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ discussion group: 778260830 (PaddlePaddle).
+- QQ discussion group: 793866180 (PaddlePaddle).
 - [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
 
 ## Copyright and License
diff --git a/README_cn.md b/README_cn.md
index 7a10cba2845498d2299fc516f5804eb1a84e4ecc..2be8be3df6e7b2f99cda1f34c9359a45e51ae5ea 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -83,7 +83,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
 ## 交流与反馈
 
 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
-- QQ群: 778260830 (PaddlePaddle)
+- QQ群: 793866180 (PaddlePaddle)
 - [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
 
 ## 版权和许可证
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 7f2addb02d36ddf85cd08542cc5baab31d495bc5..033b40622e25943285a301ca5a219ac2667a376f 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -205,23 +205,16 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
 if(WIN32)
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"")
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj")
-    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-        # match the cl's _ITERATOR_DEBUG_LEVEL
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-g -G -D_DEBUG\"")
-        if(MSVC_STATIC_CRT)
-            set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MTd")
-        else()
-            set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MDd")
-        endif()
-    elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-DNDEBUG\"")
-        if(MSVC_STATIC_CRT)
-            set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MT")
-        else()
-            set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MD")
-        endif()
-    else()
-        message(FATAL "Windows only support Release or Debug build now.
Please set visual studio build type to Release/Debug, x64 build.") + if(MSVC_STATIC_CRT) + set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -Xcompiler /MTd") + set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler /MT") + foreach(flag_var + CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE + CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "-MD") + string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) endif() endif() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fb1d4d9d56dcc6f38a86242b4d78b88ef31ddaa0..c37e28523f43c58c7cf5752ff1e1d26e9c3db4fd 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,8 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) +SET(MKLDNN_TAG f3999b71d8e4415c1985a0dfb812a3ed77ee21fa) + # Introduce variables: # * CMAKE_INSTALL_LIBDIR @@ -59,8 +60,8 @@ ExternalProject_Add( DEPENDS ${MKLDNN_DEPENDS} PREFIX ${MKLDNN_PREFIX_DIR} SOURCE_DIR ${MKLDNN_SOURCE_DIR} - BUILD_ALWAYS 1 - # UPDATE_COMMAND "" + UPDATE_COMMAND "" + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index c591a9391dfa5d3b5a452ffbb5a5d3199d387519..b0ea338d20525d8c40f2d4c1c92363c777c6ca67 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -100,9 +100,9 @@ else() "${WARPCTC_DOWNLOAD_CMD}" PREFIX ${WARPCTC_PREFIX_DIR} SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" + UPDATE_COMMAND "" PATCH_COMMAND "" - BUILD_ALWAYS 1 + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index f846623602ed79a5bd84268436a59ede1957364b..a03ff7d22dcad211af354fd5d51e0d3a44389885 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_05_19.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index a2ddad557c2956f7de21bceaf7a6699e8dfbed43..94fd29b905009ba1202abdb184d7c35b185489e5 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -28,7 +28,12 @@ function(CheckCompilerCXX14Flag) endfunction() CheckCompilerCXX14Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +if(NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +else() + set(CMAKE_CXX_STANDARD 14) +endif() + # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a5c74a46631e9d76fa78261f706a1853a80bab32..cea65f17fbe836ee5951805dfdf5d3078087ba44 100644 --- 
a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -92,7 +92,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) # including io directory for inference lib paddle_api.h include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io") -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) if(WITH_PSLIB OR WITH_DISTRIBUTE) @@ -100,7 +100,7 @@ if(NOT APPLE) else() set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif() -endif(NOT APPLE) +endif() set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library @@ -391,7 +391,7 @@ function(cc_binary TARGET_NAME) endfunction(cc_binary) function(cc_test_build TARGET_NAME) - if(WITH_TESTING) + if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -409,14 +409,12 @@ function(cc_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) endif() - - check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) - endfunction() function(cc_test_run TARGET_NAME) - if(WITH_TESTING) + if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs COMMAND ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) diff --git a/cmake/init.cmake b/cmake/init.cmake index 4bdcaeb4c5f3c088778126bf841f05d5157b7de9..0ebcdc8ceeebcabc2c7c639076939cef5c0fe546 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -17,16 +17,30 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-g") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") + endif() else() - set(CMAKE_C_FLAGS_DEBUG "/Zi /DEBUG") - set(CMAKE_C_FLAGS_RELEASE "/O2 /DNDEBUG") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") - set(CMAKE_C_FLAGS_MINSIZEREL "/Os /DNDEBUG") + set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_DEBUG "/Zi /DEBUG") - set(CMAKE_CXX_FLAGS_RELEASE "/O2 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") - set(CMAKE_CXX_FLAGS_MINSIZEREL "/Os /DNDEBUG") + set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"") + set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG") + endif() # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. 
But it's dangerous @@ -34,10 +48,3 @@ else() # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() - -if(WITH_GPU) - set(CMAKE_CUDA_FLAGS_DEBUG "-g") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") -endif() diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h new file mode 100644 index 0000000000000000000000000000000000000000..3f84b5c4b212e2b261a4ef9b3f21163e5ef705b2 --- /dev/null +++ b/paddle/fluid/distributed/common/sparse_sharding_merge.h @@ -0,0 +1,311 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include + +#include +#include +#include +#include // NOLINT +#include + +#include +#include "boost/lexical_cast.hpp" +#include "glog/logging.h" +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/dim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/string/split.h" + +constexpr int FG = 256 * 1024 * 1024; +constexpr int Q_SIZE = 10000; +constexpr int BUCKET = 10; +constexpr char XEOF[] = "EOF"; + +using boost::lexical_cast; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +namespace paddle { +namespace distributed { + +class ShardingMerge { + public: + ShardingMerge() {} + ~ShardingMerge() {} + + void Merge(const std::vector &inputs, + const std::vector &feasigns, const std::string &output, + const int embedding_dim) { + pool_.reset(new ::ThreadPool(inputs.size())); + + std::vector> tasks(inputs.size()); + std::vector> rows; + rows.resize(inputs.size()); + + auto begin = GetCurrentUS(); + for (int x = 0; x < inputs.size(); ++x) { + tasks[x] = pool_->enqueue([this, x, &rows, &inputs, &feasigns]() -> int { + DeserializeRowsFromFile(inputs[x], feasigns[x], &rows[x]); + return 0; + }); + } + + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + } + + int64_t total_rows = 0; + for (auto x = 0; x < rows.size(); x++) { + total_rows += rows[x].size(); + } + + auto end = GetCurrentUS(); + + VLOG(0) << "got " << total_rows + << " feasigin ids from sparse embedding using " << end - begin; + + std::vector total_dims = {total_rows, + static_cast(embedding_dim)}; + + std::vector> batch_buckets; + batch_buckets.resize(inputs.size()); + + for (int x = 0; x < rows.size(); ++x) { + batch_buckets[x] = bucket(rows[x].size(), BUCKET); + } + + std::ofstream out(output, std::ios::binary); + + begin = GetCurrentUS(); + SerializeRowsToStream(out, rows, batch_buckets, total_rows); + end = GetCurrentUS(); + VLOG(0) << "write rows to oostrream using " << end - begin; + + begin = 
GetCurrentUS(); + SerializePreTensorToStream(out, total_dims); + end = GetCurrentUS(); + VLOG(0) << "write pretensor to oostrream using " << end - begin; + + begin = GetCurrentUS(); + SerializeValueToStream(out, inputs, batch_buckets, embedding_dim); + end = GetCurrentUS(); + VLOG(0) << "write values to oostrream using " << end - begin; + } + + private: + void SerializeRowsToStream(std::ostream &os, + const std::vector> &rows, + const std::vector> &batch_buckets, + int64_t total_rows) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + + { + // the 2st field, rows information + os.write(reinterpret_cast(&total_rows), sizeof(total_rows)); + + for (int b = 0; b < BUCKET; ++b) { + for (int x = 0; x < batch_buckets.size(); ++x) { + auto begin = batch_buckets[x][b]; + auto end = batch_buckets[x][b + 1]; + + if (end - begin == 0) continue; + + os.write(reinterpret_cast(rows[x].data() + begin), + sizeof(int64_t) * (end - begin)); + } + } + + // the 3st field, the height of SelectedRows + int64_t height = total_rows; + os.write(reinterpret_cast(&height), sizeof(height)); + } + } + + void SerializePreTensorToStream(std::ostream &os, + const std::vector &dims) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + framework::proto::VarType::TensorDesc desc; + desc.set_data_type(framework::proto::VarType::FP32); + auto *pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + } + + void SerializeValueToVec(std::ifstream &in, const int batch, + const int embedding_dim, std::vector *out) { + auto queue = + std::make_shared>>(); + + auto read = [batch, &in, &queue]() { + std::string line; + std::vector columns; + std::vector values_str; + + int count = 0; + + while (std::getline(in, line)) { + ++count; + columns = string::Split(line, '\t'); + + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", skip it"; + continue; + } + + values_str = string::Split(columns[4], ','); + queue->Push(values_str); + + if (count >= batch) { + break; + } + } + queue->Push({}); + }; + + auto write = [embedding_dim, &out, &queue]() { + std::vector values_str; + std::string line; + + while (true) { + queue->Pop(&values_str); + + if (values_str.size() == 0) { + break; + } + + for (int x = 0; x < embedding_dim; ++x) { + float v = 0.0; + try { + v = lexical_cast(values_str[x]); + } catch (boost::bad_lexical_cast &e) { + VLOG(0) << " get unexpected line: " << line; + } + out->push_back(v); + } + } + }; + + std::thread p_read(read); + std::thread p_write(write); + p_read.join(); + p_write.join(); + } + + void SerializeVecToStream(std::ostream &out, + const std::vector &value) { + out.write(reinterpret_cast(value.data()), + static_cast(sizeof(float) * value.size())); + } + + void SerializeValueToStream( + std::ostream &out, const std::vector &ins, + const std::vector> &batch_buckets, + const int embedding_dim) { + std::vector> in_streams; + + for (int x = 0; x < ins.size(); ++x) { + in_streams.emplace_back(std::make_shared(ins[x])); + } + + std::vector> tasks(ins.size()); + + for (int b = 0; b < BUCKET; ++b) { + std::vector> values; + 
values.resize(tasks.size()); + + auto begin = GetCurrentUS(); + + for (int x = 0; x < tasks.size(); ++x) { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + values[x].clear(); + values[x].reserve(batch * embedding_dim); + } + + for (int x = 0; x < tasks.size(); ++x) { + tasks[x] = + pool_->enqueue([this, b, x, &out, &in_streams, &batch_buckets, + &values, embedding_dim]() -> int { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + if (batch == 0) return 0; + SerializeValueToVec(*(in_streams[x].get()), batch, embedding_dim, + &values[x]); + return 0; + }); + } + + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + } + + auto end = GetCurrentUS(); + + auto begin1 = GetCurrentUS(); + for (size_t x = 0; x < tasks.size(); ++x) { + SerializeVecToStream(out, values[x]); + } + auto end1 = GetCurrentUS(); + + VLOG(0) << "serialize buckets " << b << " read using " << end - begin + << ", to oostream using " << end1 - begin1; + } + } + + void DeserializeRowsFromFile(const std::string &input_file, + const int64_t feasigns, + std::vector *rows) { + std::string line; + std::vector columns; + std::ifstream file(input_file); + + rows->reserve(feasigns); + + while (std::getline(file, line)) { + columns = string::Split(line, '\t'); + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", skip it"; + continue; + } + rows->push_back(std::stoull(columns[0])); + } + + VLOG(0) << "parse " << rows->size() << " embedding rows from " + << input_file; + } + + private: + std::unique_ptr<::ThreadPool> pool_; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index f81f84b1e117510443a5698a6ba1574262f640a5..2305001ad6f8f90eea49efa88b2a2615176f3ffb 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -14,6 +14,8 @@ #pragma once +#include + #include #include #include @@ -83,5 +85,11 @@ std::string to_string(const std::vector& vec) { } return ss.str(); } + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; } -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index eafb4d596cc1671db26189b84ea9d0c0c31ea398..70f2da6d7252cee0268bdd35999926a232bc5b34 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -80,11 +80,11 @@ std::future GraphBrpcClient::get_node_feat( [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; - int fail_num = 0; + size_t fail_num = 0; for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { - if (closure->check_response(request_idx, - PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + if (closure->check_response(request_idx, PS_GRAPH_GET_NODE_FEAT) != + 0) { ++fail_num; } else { auto &res_io_buffer = @@ -144,6 +144,163 @@ std::future GraphBrpcClient::get_node_feat( return fut; } + +std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + server_size, [&, server_size = this->server_size ](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < server_size; ++request_idx) { + if 
(closure->check_response(request_idx, PS_GRAPH_CLEAR) != 0) { + ++fail_num; + break; + } + } + ret = fail_num == 0 ? 0 : -1; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + for (size_t i = 0; i < server_size; i++) { + int server_index = i; + closure->request(server_index)->set_cmd_id(PS_GRAPH_CLEAR); + closure->request(server_index)->set_table_id(table_id); + closure->request(server_index)->set_client_id(_client_id); + + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(server_index)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(server_index), + closure->request(server_index), + closure->response(server_index), closure); + } + return fut; +} +std::future GraphBrpcClient::add_graph_node( + uint32_t table_id, std::vector &node_id_list, + std::vector &is_weighted_list) { + std::vector> request_bucket; + std::vector> is_weighted_bucket; + bool add_weight = is_weighted_list.size() > 0; + std::vector server_index_arr; + std::vector index_mapping(server_size, -1); + for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_id_list[query_idx]); + if (index_mapping[server_index] == -1) { + index_mapping[server_index] = request_bucket.size(); + server_index_arr.push_back(server_index); + request_bucket.push_back(std::vector()); + if (add_weight) is_weighted_bucket.push_back(std::vector()); + } + request_bucket[index_mapping[server_index]].push_back( + node_id_list[query_idx]); + if (add_weight) + is_weighted_bucket[index_mapping[server_index]].push_back( + query_idx < is_weighted_list.size() ? is_weighted_list[query_idx] + : false); + } + size_t request_call_num = request_bucket.size(); + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, [&, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_ADD_GRAPH_NODE) != + 0) { + ++fail_num; + } + } + ret = fail_num == request_call_num ? 
-1 : 0; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = server_index_arr[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_ADD_GRAPH_NODE); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = request_bucket[request_idx].size(); + closure->request(request_idx) + ->add_params((char *)request_bucket[request_idx].data(), + sizeof(uint64_t) * node_num); + if (add_weight) { + bool weighted[is_weighted_bucket[request_idx].size() + 1]; + for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) + weighted[j] = is_weighted_bucket[request_idx][j]; + closure->request(request_idx) + ->add_params((char *)weighted, + sizeof(bool) * is_weighted_bucket[request_idx].size()); + } + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + return fut; +} +std::future GraphBrpcClient::remove_graph_node( + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; + std::vector server_index_arr; + std::vector index_mapping(server_size, -1); + for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_id_list[query_idx]); + if (index_mapping[server_index] == -1) { + index_mapping[server_index] = request_bucket.size(); + server_index_arr.push_back(server_index); + request_bucket.push_back(std::vector()); + } + request_bucket[index_mapping[server_index]].push_back( + node_id_list[query_idx]); + } + size_t request_call_num = request_bucket.size(); + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, [&, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_REMOVE_GRAPH_NODE) != 0) { + ++fail_num; + } + } + ret = fail_num == request_call_num ? 
-1 : 0; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = server_index_arr[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_REMOVE_GRAPH_NODE); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = request_bucket[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)request_bucket[request_idx].data(), + sizeof(uint64_t) * node_num); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + return fut; +} // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, @@ -174,8 +331,8 @@ std::future GraphBrpcClient::batch_sample_neighboors( [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; - int fail_num = 0; - for (int request_idx = 0; request_idx < request_call_num; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { @@ -254,13 +411,14 @@ std::future GraphBrpcClient::random_sample_nodes( auto &res_io_buffer = closure->cntl(0)->response_attachment(); butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); size_t bytes_size = io_buffer_itr.bytes_left(); - char buffer[bytes_size]; + char *buffer = new char[bytes_size]; auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { ids.push_back(*(uint64_t *)(buffer + index)); index += GraphNode::id_size; } + delete[] buffer; } closure->set_promise_value(ret); }); @@ -292,7 +450,7 @@ std::future GraphBrpcClient::pull_graph_list( auto &res_io_buffer = closure->cntl(0)->response_attachment(); butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); size_t bytes_size = io_buffer_itr.bytes_left(); - char buffer[bytes_size]; + char *buffer = new char[bytes_size]; io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { @@ -301,6 +459,7 @@ std::future GraphBrpcClient::pull_graph_list( index += node.get_size(false); res.push_back(node); } + delete buffer; } closure->set_promise_value(ret); }); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index 4e6775a4bedaf1a4028fe483f58be818ef1e3581..5696e8b08037b7027939f472f58ec79925143e4f 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -78,6 +78,13 @@ class GraphBrpcClient : public BrpcPsClient { const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, std::vector>& res); + + virtual std::future clear_nodes(uint32_t table_id); + virtual std::future add_graph_node( + uint32_t table_id, std::vector& node_id_list, + std::vector& is_weighted_list); + virtual std::future remove_graph_node( + uint32_t table_id, std::vector& node_id_list); virtual 
int32_t initialize(); int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index bdd926278b624b9e9bfdf19a4f293784bef6e28f..52ac8c5d688a4ada72212923bdd478b788e422ee 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -24,6 +24,14 @@ namespace paddle { namespace distributed { +#define CHECK_TABLE_EXIST(table, request, response) \ + if (table == NULL) { \ + std::string err_msg("table not found with table_id:"); \ + err_msg.append(std::to_string(request.table_id())); \ + set_response_code(response, -1, err_msg.c_str()); \ + return -1; \ + } + int32_t GraphBrpcServer::initialize() { auto &service_config = _config.downpour_server_param().service_param(); if (!service_config.has_service_class()) { @@ -71,6 +79,58 @@ uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { return 0; } +int32_t GraphBrpcService::clear_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + ((GraphTable *)table)->clear_nodes(); + return 0; +} + +int32_t GraphBrpcService::add_graph_node(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + std::vector is_weighted_list; + if (request.params_size() == 2) { + size_t weight_list_size = request.params(1).size() / sizeof(bool); + bool *is_weighted_buffer = (bool *)(request.params(1).c_str()); + is_weighted_list = std::vector(is_weighted_buffer, + is_weighted_buffer + weight_list_size); + } + + ((GraphTable *)table)->add_graph_node(node_ids, is_weighted_list); + return 0; +} +int32_t GraphBrpcService::remove_graph_node(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 1 argument"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + ((GraphTable *)table)->remove_graph_node(node_ids); + return 0; +} int32_t GraphBrpcServer::port() { return _server.listen_address().port; } int32_t GraphBrpcService::initialize() { @@ -92,21 +152,17 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::graph_random_sample_nodes; _service_handler_map[PS_GRAPH_GET_NODE_FEAT] = &GraphBrpcService::graph_get_node_feat; - + _service_handler_map[PS_GRAPH_CLEAR] = &GraphBrpcService::clear_nodes; + _service_handler_map[PS_GRAPH_ADD_GRAPH_NODE] = + &GraphBrpcService::add_graph_node; + _service_handler_map[PS_GRAPH_REMOVE_GRAPH_NODE] = + &GraphBrpcService::remove_graph_node; // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); return 0; } -#define CHECK_TABLE_EXIST(table, request, response) \ - if (table == NULL) { \ - std::string 
err_msg("table not found with table_id:"); \ - err_msg.append(std::to_string(request.table_id())); \ - set_response_code(response, -1, err_msg.c_str()); \ - return -1; \ - } - int32_t GraphBrpcService::initialize_shard_info() { if (!_is_initialize_shard_info) { std::lock_guard guard(_initialize_shard_mutex); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index 32c572f9e6c2bf759c59190679bcf7570a807f2d..47c370572826ac2807e4ea5cb36cf3a667dfed10 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -86,6 +86,13 @@ class GraphBrpcService : public PsBaseService { int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t clear_nodes(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t add_graph_node(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t remove_graph_node(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); int32_t barrier(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); int32_t load_one_table(Table *table, const PsRequestMessage &request, diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc index 61e4e0cf7bb9155d25c630296c2b55a7d3400bfc..39befb1a112c854a183903d76a71d9e6c920b215 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -44,6 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } +void add_graph_node(std::vector node_ids, + std::vector weight_list) {} +void remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -247,6 +250,34 @@ void GraphPyClient::load_edge_file(std::string name, std::string filepath, } } +void GraphPyClient::clear_nodes(std::string name) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = get_ps_client()->clear_nodes(table_id); + status.wait(); + } +} + +void GraphPyClient::add_graph_node(std::string name, + std::vector& node_ids, + std::vector& weight_list) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->add_graph_node(table_id, node_ids, weight_list); + status.wait(); + } +} + +void GraphPyClient::remove_graph_node(std::string name, + std::vector& node_ids) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = get_ps_client()->remove_graph_node(table_id, node_ids); + status.wait(); + } +} + void GraphPyClient::load_node_file(std::string name, std::string filepath) { // 'n' means load nodes and 'node_type' follows std::string params = "n" + name; diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index c6657be96ba446d2f7538943aab43dd47e1868fb..da027fbae3e6f0ca1e902795b0640cee1e0b76cc 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -141,6 +141,10 @@ class GraphPyClient : public GraphPyService { void finalize_worker(); 
void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); + void clear_nodes(std::string name); + void add_graph_node(std::string name, std::vector& node_ids, + std::vector& weight_list); + void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index d908c26da9870a93d81c0242ac03e26cfebdb976..a4b811e950a3b56443261ceac37fa658007d519d 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -52,6 +52,9 @@ enum PsCmdID { PS_GRAPH_SAMPLE_NEIGHBOORS = 31; PS_GRAPH_SAMPLE_NODES = 32; PS_GRAPH_GET_NODE_FEAT = 33; + PS_GRAPH_CLEAR = 34; + PS_GRAPH_ADD_GRAPH_NODE = 35; + PS_GRAPH_REMOVE_GRAPH_NODE = 36; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 0dc99de1bfe82a691fdacb834acd1ad606dcb04b..92f8304a8bf62178988fef447fcf8c309d8589ea 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -35,6 +35,77 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } +int32_t GraphTable::add_graph_node(std::vector &id_list, + std::vector &is_weight_list) { + size_t node_size = id_list.size(); + std::vector>> batch(task_pool_size_); + for (size_t i = 0; i < node_size; i++) { + size_t shard_id = id_list[i] % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + continue; + } + batch[get_thread_pool_index(id_list[i])].push_back( + {id_list[i], i < is_weight_list.size() ? 
is_weight_list[i] : false}); + } + std::vector> tasks; + for (size_t i = 0; i < batch.size(); ++i) { + if (!batch[i].size()) continue; + tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { + for (auto &p : batch[i]) { + size_t index = p.first % this->shard_num - this->shard_start; + this->shards[index].add_graph_node(p.first)->build_edges(p.second); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + +int32_t GraphTable::remove_graph_node(std::vector &id_list) { + size_t node_size = id_list.size(); + std::vector> batch(task_pool_size_); + for (size_t i = 0; i < node_size; i++) { + size_t shard_id = id_list[i] % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) continue; + batch[get_thread_pool_index(id_list[i])].push_back(id_list[i]); + } + std::vector> tasks; + for (size_t i = 0; i < batch.size(); ++i) { + if (!batch[i].size()) continue; + tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { + for (auto &p : batch[i]) { + size_t index = p % this->shard_num - this->shard_start; + this->shards[index].delete_node(p); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + +void GraphShard::clear() { + for (size_t i = 0; i < bucket.size(); i++) { + delete bucket[i]; + } + bucket.clear(); + node_location.clear(); +} + +GraphShard::~GraphShard() { clear(); } +void GraphShard::delete_node(uint64_t id) { + auto iter = node_location.find(id); + if (iter == node_location.end()) return; + int pos = iter->second; + delete bucket[pos]; + if (pos != (int)bucket.size() - 1) { + bucket[pos] = bucket.back(); + node_location[bucket.back()->get_id()] = pos; + } + node_location.erase(id); + bucket.pop_back(); +} GraphNode *GraphShard::add_graph_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); @@ -79,11 +150,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( int start = 0, end, index = 0, total_size = 0; res.clear(); std::vector>> tasks; - // std::string temp = ""; - // for(int i = 0;i < shards.size();i++) - // temp+= std::to_string((int)shards[i].get_size()) + " "; - // VLOG(0)<<"range distribution "<enqueue( [this, first, second, i]() -> std::vector { return shards[i].get_ids_by_range(first, second); @@ -106,7 +172,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( } total_size += shards[i].get_size(); } - for (int i = 0; i < tasks.size(); i++) { + for (size_t i = 0; i < tasks.size(); i++) { auto vec = tasks[i].get(); for (auto &id : vec) { res.push_back(id); @@ -219,7 +285,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { for (auto &shard : shards) { auto bucket = shard.get_bucket(); - for (int i = 0; i < bucket.size(); i++) { + for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); } } @@ -238,10 +304,29 @@ Node *GraphTable::find_node(uint64_t id) { uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return node_id % shard_num % shard_num_per_table % task_pool_size_; } + +uint32_t GraphTable::get_thread_pool_index_by_shard_index( + uint64_t shard_index) { + return shard_index % shard_num_per_table % task_pool_size_; +} + +int32_t GraphTable::clear_nodes() { + std::vector> tasks; + for (size_t i = 0; i < shards.size(); i++) { + tasks.push_back( + _shards_task_pool[get_thread_pool_index_by_shard_index(i)]->enqueue( + [this, i]() -> int { + this->shards[i].clear(); + return 0; + })); + } + for (size_t i = 0; i < 
tasks.size(); i++) tasks[i].get(); + return 0; +} + int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { - bool need_feature = false; int total_size = 0; for (int i = 0; i < shards.size(); i++) { total_size += shards[i].get_size(); @@ -281,7 +366,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } std::vector> first_half, second_half; int start_index = rand() % total_size; - for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + for (size_t i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) first_half.push_back({ranges_pos[i] + start_index, ranges_pos[i] + ranges_len[i] + start_index}); @@ -386,7 +471,6 @@ std::pair GraphTable::parse_feature( if (this->feat_id_map.count(fields[0])) { int32_t id = this->feat_id_map[fields[0]]; std::string dtype = this->feat_dtype[id]; - int32_t shape = this->feat_shape[id]; std::vector values(fields.begin() + 1, fields.end()); if (dtype == "feasign") { return std::make_pair( diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index b18da82abe61c9695712f542e187ac48fd5edc9d..5eeb3915f5b1f251dd5edf1a5199621a7cd0069b 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -36,11 +36,12 @@ class GraphShard { size_t get_size(); GraphShard() {} GraphShard(int shard_num) { this->shard_num = shard_num; } + ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); std::vector get_ids_by_range(int start, int end) { std::vector res; - for (int i = start; i < end && i < bucket.size(); i++) { + for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; @@ -48,6 +49,8 @@ class GraphShard { GraphNode *add_graph_node(uint64_t id); FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); + void delete_node(uint64_t id); + void clear(); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); std::unordered_map get_node_location() { return node_location; @@ -85,6 +88,11 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); + int32_t add_graph_node(std::vector &id_list, + std::vector &is_weight_list); + + int32_t remove_graph_node(std::vector &id_list); + Node *find_node(uint64_t id); virtual int32_t pull_sparse(float *values, @@ -97,6 +105,7 @@ class GraphTable : public SparseTable { return 0; } + virtual int32_t clear_nodes(); virtual void clear() {} virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string ¶m) { return 0; } @@ -105,6 +114,7 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); virtual uint32_t get_thread_pool_index(uint64_t node_id); virtual std::pair parse_feature(std::string feat_str); @@ -128,4 +138,5 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; }; } // namespace distributed + }; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index a4f672c2963a84976c3b56dbee3b38f6675a2c36..b667aec186f9e343228d58c84e580554ed55a698 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ 
b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -134,10 +134,23 @@ void ProcessALine(const std::vector& columns, const Meta& meta, } } -int64_t SaveToText(std::ostream* os, std::shared_ptr block, - const int mode) { - int64_t save_num = 0; +void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, + const size_t shard_idx, const int64_t total) { + // save meta + std::stringstream stream; + stream << "param=" << common.table_name() << "\n"; + stream << "shard_id=" << shard_idx << "\n"; + stream << "row_names=" << paddle::string::join_strings(common.params(), ',') + << "\n"; + stream << "row_dims=" << paddle::string::join_strings(common.dims(), ',') + << "\n"; + stream << "count=" << total << "\n"; + os->write(stream.str().c_str(), sizeof(char) * stream.str().size()); +} +int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, const int mode) { + int64_t save_num = 0; for (auto& table : block->values_) { for (auto& value : table) { if (mode == SaveMode::delta && !value.second->need_save_) { @@ -334,16 +347,24 @@ int32_t CommonSparseTable::set_global_lr(float* lr) { int32_t CommonSparseTable::load(const std::string& path, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); - VLOG(3) << "sparse table load with " << path << " with meta " << param; LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_, &shard_values_); rwlock_->UNLock(); + auto end = GetCurrentUS(); + + auto varname = _config.common().table_name(); + VLOG(0) << "load " << varname << " with value: " << path + << " , meta: " << param + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } int32_t CommonSparseTable::save(const std::string& dirname, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); int mode = std::stoi(param); VLOG(3) << "sparse table save: " << dirname << " mode: " << mode; @@ -356,36 +377,33 @@ int32_t CommonSparseTable::save(const std::string& dirname, VLOG(3) << "save " << varname << " in dir: " << var_store << " begin"; std::vector params(_config.common().params().begin(), _config.common().params().end()); + std::string shard_var_pre = string::Sprintf("%s.block%d", varname, _shard_idx); std::string value_ = string::Sprintf("%s/%s.txt", var_store, shard_var_pre); - std::unique_ptr value_out(new std::ofstream(value_)); + std::unique_ptr vs(new std::ofstream(value_)); int64_t total_ins = 0; for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { // save values - total_ins += SaveToText(value_out.get(), shard_values_[shard_id], mode); + auto shard_save_num = SaveValueToText(vs.get(), shard_values_[shard_id], + _shards_task_pool[shard_id], mode); + total_ins += shard_save_num; } - value_out->close(); + vs->close(); - // save meta - std::stringstream stream; - stream << "param=" << _config.common().table_name() << "\n"; - stream << "shard_id=" << _shard_idx << "\n"; - stream << "row_names=" - << paddle::string::join_strings(_config.common().params(), ',') - << "\n"; - stream << "row_dims=" - << paddle::string::join_strings(_config.common().dims(), ',') << "\n"; - stream << "count=" << total_ins << "\n"; std::string meta_ = string::Sprintf("%s/%s.meta", var_store, shard_var_pre); - std::unique_ptr meta_out(new std::ofstream(meta_)); - meta_out->write(stream.str().c_str(), sizeof(char) * stream.str().size()); - meta_out->close(); - VLOG(3) << "save " << varname << " in dir: " << var_store << " done"; + std::unique_ptr ms(new 
std::ofstream(meta_)); + SaveMetaToText(ms.get(), _config.common(), _shard_idx, total_ins); + ms->close(); + + auto end = GetCurrentUS(); rwlock_->UNLock(); + VLOG(0) << "save " << varname << " with path: " << value_ + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } @@ -403,8 +421,6 @@ std::pair CommonSparseTable::print_table_stat() { } int32_t CommonSparseTable::pour() { - rwlock_->RDLock(); - std::vector values; std::vector keys; @@ -421,14 +437,11 @@ int32_t CommonSparseTable::pour() { _push_sparse(keys.data(), values.data(), pull_reservoir_.size()); pull_reservoir_.clear(); - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { - rwlock_->RDLock(); - auto shard_num = task_pool_size_; std::vector> tasks(shard_num); @@ -464,7 +477,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -507,7 +519,6 @@ int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -531,7 +542,6 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -569,7 +579,6 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float** values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -596,14 +605,11 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); - std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -635,14 +641,12 @@ int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::flush() { return 0; } int32_t CommonSparseTable::shrink(const std::string& param) { - rwlock_->WRLock(); int threshold = std::stoi(param); VLOG(3) << "sparse table shrink: " << threshold; @@ -651,7 +655,6 @@ int32_t CommonSparseTable::shrink(const std::string& param) { VLOG(4) << shard_id << " " << task_pool_size_ << " begin shrink"; shard_values_[shard_id]->Shrink(threshold); } - rwlock_->UNLock(); return 0; } diff --git a/paddle/fluid/distributed/table/graph_edge.cc b/paddle/fluid/distributed/table/graph_edge.cc deleted file mode 100644 index cc90f4c6516c1873b078b96c550d0d52ac5d3b9c..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_edge.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/distributed/table/graph_edge.h" -#include -namespace paddle { -namespace distributed { - -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { - id_arr.push_back(id); -} - -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { - id_arr.push_back(id); - weight_arr.push_back(weight); -} -} -} diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h deleted file mode 100644 index 3dfe5a6f357a7cd7d79834a20b6411995665f4fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_edge.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -namespace paddle { -namespace distributed { - -class GraphEdgeBlob { - public: - GraphEdgeBlob() {} - virtual ~GraphEdgeBlob() {} - size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } - virtual float get_weight(int idx) { return 1; } - - protected: - std::vector id_arr; -}; - -class WeightedGraphEdgeBlob : public GraphEdgeBlob { - public: - WeightedGraphEdgeBlob() {} - virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); - virtual float get_weight(int idx) { return weight_arr[idx]; } - - protected: - std::vector weight_arr; -}; -} -} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc deleted file mode 100644 index 27a2cafaf4f0fec95de818204ebd191a5083e50a..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_node.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/distributed/table/graph_node.h" -#include -namespace paddle { -namespace distributed { - -GraphNode::~GraphNode() { - if (sampler != nullptr) { - delete sampler; - sampler = nullptr; - } - if (edges != nullptr) { - delete edges; - edges = nullptr; - } -} - -int Node::weight_size = sizeof(float); -int Node::id_size = sizeof(uint64_t); -int Node::int_size = sizeof(int); - -int Node::get_size(bool need_feature) { return id_size + int_size; } - -void Node::to_buffer(char* buffer, bool need_feature) { - memcpy(buffer, &id, id_size); - buffer += id_size; - - int feat_num = 0; - memcpy(buffer, &feat_num, sizeof(int)); -} - -void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } - -int FeatureNode::get_size(bool need_feature) { - int size = id_size + int_size; // id, feat_num - if (need_feature) { - size += feature.size() * int_size; - for (const std::string& fea : feature) { - size += fea.size(); - } - } - return size; -} - -void GraphNode::build_edges(bool is_weighted) { - if (edges == nullptr) { - if (is_weighted == true) { - edges = new WeightedGraphEdgeBlob(); - } else { - edges = new GraphEdgeBlob(); - } - } -} -void GraphNode::build_sampler(std::string sample_type) { - if (sample_type == "random") { - sampler = new RandomSampler(); - } else if (sample_type == "weighted") { - sampler = new WeightedSampler(); - } - sampler->build(edges); -} -void FeatureNode::to_buffer(char* buffer, bool need_feature) { - memcpy(buffer, &id, id_size); - buffer += id_size; - - int feat_num = 0; - int feat_len; - if (need_feature) { - feat_num += feature.size(); - memcpy(buffer, &feat_num, sizeof(int)); - buffer += sizeof(int); - for (int i = 0; i < feat_num; ++i) { - feat_len = feature[i].size(); - memcpy(buffer, &feat_len, sizeof(int)); - buffer += sizeof(int); - memcpy(buffer, feature[i].c_str(), feature[i].size()); - buffer += feature[i].size(); - } - } else { - memcpy(buffer, &feat_num, sizeof(int)); - } -} -void FeatureNode::recover_from_buffer(char* buffer) { - int feat_num, feat_len; - memcpy(&id, buffer, id_size); - buffer += id_size; - - memcpy(&feat_num, buffer, sizeof(int)); - buffer += sizeof(int); - - feature.clear(); - for (int i = 0; i < feat_num; ++i) { - memcpy(&feat_len, buffer, sizeof(int)); - buffer += sizeof(int); - - char str[feat_len + 1]; - memcpy(str, buffer, feat_len); - buffer += feat_len; - str[feat_len] = '\0'; - feature.push_back(std::string(str)); - } -} -} -} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h deleted file mode 100644 index c3e8e3ce5b50d06945857ded1db168f84f955c5f..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_node.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" -namespace paddle { -namespace distributed { - -class Node { - public: - Node() {} - Node(uint64_t id) : id(id) {} - virtual ~Node() {} - static int id_size, int_size, weight_size; - uint64_t get_id() { return id; } - void set_id(uint64_t id) { this->id = id; } - - virtual void build_edges(bool is_weighted) {} - virtual void build_sampler(std::string sample_type) {} - virtual void add_edge(uint64_t id, float weight) {} - virtual std::vector sample_k(int k) { return std::vector(); } - virtual uint64_t get_neighbor_id(int idx) { return 0; } - virtual float get_neighbor_weight(int idx) { return 1.; } - - virtual int get_size(bool need_feature); - virtual void to_buffer(char *buffer, bool need_feature); - virtual void recover_from_buffer(char *buffer); - virtual std::string get_feature(int idx) { return std::string(""); } - virtual void set_feature(int idx, std::string str) {} - virtual void set_feature_size(int size) {} - virtual int get_feature_size() { return 0; } - - protected: - uint64_t id; -}; - -class GraphNode : public Node { - public: - GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} - GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} - virtual ~GraphNode(); - virtual void build_edges(bool is_weighted); - virtual void build_sampler(std::string sample_type); - virtual void add_edge(uint64_t id, float weight) { - edges->add_edge(id, weight); - } - virtual std::vector sample_k(int k) { return sampler->sample_k(k); } - virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } - virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } - - protected: - Sampler *sampler; - GraphEdgeBlob *edges; -}; - -class FeatureNode : public Node { - public: - FeatureNode() : Node() {} - FeatureNode(uint64_t id) : Node(id) {} - virtual ~FeatureNode() {} - virtual int get_size(bool need_feature); - virtual void to_buffer(char *buffer, bool need_feature); - virtual void recover_from_buffer(char *buffer); - virtual std::string get_feature(int idx) { - if (idx < (int)this->feature.size()) { - return this->feature[idx]; - } else { - return std::string(""); - } - } - - virtual void set_feature(int idx, std::string str) { - if (idx >= (int)this->feature.size()) { - this->feature.resize(idx + 1); - } - this->feature[idx] = str; - } - virtual void set_feature_size(int size) { this->feature.resize(size); } - virtual int get_feature_size() { return this->feature.size(); } - - template - static std::string parse_value_to_bytes(std::vector feat_str) { - T v; - size_t Tsize = sizeof(T) * feat_str.size(); - char buffer[Tsize]; - for (size_t i = 0; i < feat_str.size(); i++) { - std::stringstream ss(feat_str[i]); - ss >> v; - std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); - } - return std::string(buffer, Tsize); - } - - template - static std::vector parse_bytes_to_array(std::string feat_str) { - T v; - std::vector out; - size_t start = 0; - const char *buffer = feat_str.data(); - while (start < feat_str.size()) { - std::memcpy((char *)&v, buffer + start, sizeof(T)); - start += sizeof(T); - out.push_back(v); - } - return out; - } - - protected: - std::vector feature; -}; -} -} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc deleted file mode 100644 index 059a1d64bc392d7ef6936c008bbeec3bef3a5fb9..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/distributed/table/graph_weighted_sampler.cc +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" -#include -#include -namespace paddle { -namespace distributed { - -void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } - -std::vector RandomSampler::sample_k(int k) { - int n = edges->size(); - if (k > n) { - k = n; - } - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); - std::vector sample_result; - std::unordered_map replace_map; - while (k--) { - int rand_int = rand() % n; - auto iter = replace_map.find(rand_int); - if (iter == replace_map.end()) { - sample_result.push_back(rand_int); - } else { - sample_result.push_back(iter->second); - } - - iter = replace_map.find(n - 1); - if (iter == replace_map.end()) { - replace_map[rand_int] = n - 1; - } else { - replace_map[rand_int] = iter->second; - } - --n; - } - return sample_result; -} - -WeightedSampler::WeightedSampler() { - left = nullptr; - right = nullptr; - edges = nullptr; -} - -WeightedSampler::~WeightedSampler() { - if (left != nullptr) { - delete left; - left = nullptr; - } - if (right != nullptr) { - delete right; - right = nullptr; - } -} - -void WeightedSampler::build(GraphEdgeBlob *edges) { - if (left != nullptr) { - delete left; - left = nullptr; - } - if (right != nullptr) { - delete right; - right = nullptr; - } - return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); -} - -void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, - int end) { - count = 0; - this->edges = edges; - if (start + 1 == end) { - left = right = nullptr; - idx = start; - count = 1; - weight = edges->get_weight(idx); - - } else { - left = new WeightedSampler(); - right = new WeightedSampler(); - left->build_one(edges, start, start + (end - start) / 2); - right->build_one(edges, start + (end - start) / 2, end); - weight = left->weight + right->weight; - count = left->count + right->count; - } -} -std::vector WeightedSampler::sample_k(int k) { - if (k > count) { - k = count; - } - std::vector sample_result; - float subtract; - std::unordered_map subtract_weight_map; - std::unordered_map subtract_count_map; - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); - while (k--) { - float query_weight = rand() % 100000 / 100000.0; - query_weight *= weight - subtract_weight_map[this]; - sample_result.push_back(sample(query_weight, subtract_weight_map, - subtract_count_map, subtract)); - } - return sample_result; -} - -int WeightedSampler::sample( - float query_weight, - std::unordered_map &subtract_weight_map, - std::unordered_map &subtract_count_map, - float &subtract) { - if (left == nullptr) { - subtract_weight_map[this] = weight; - subtract = weight; - subtract_count_map[this] = 1; - return idx; - } - int left_count = left->count - subtract_count_map[left]; - int 
right_count = right->count - subtract_count_map[right]; - float left_subtract = subtract_weight_map[left]; - int return_idx; - if (right_count == 0 || - left_count > 0 && left->weight - left_subtract >= query_weight) { - return_idx = left->sample(query_weight, subtract_weight_map, - subtract_count_map, subtract); - } else { - return_idx = - right->sample(query_weight - (left->weight - left_subtract), - subtract_weight_map, subtract_count_map, subtract); - } - subtract_weight_map[this] += subtract; - subtract_count_map[this]++; - return return_idx; -} -} -} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h deleted file mode 100644 index cfc341d27c6b766fcee57e8973a4353d4fe93b4e..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_weighted_sampler.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "paddle/fluid/distributed/table/graph_edge.h" -namespace paddle { -namespace distributed { - -class Sampler { - public: - virtual ~Sampler() {} - virtual void build(GraphEdgeBlob *edges) = 0; - virtual std::vector sample_k(int k) = 0; -}; - -class RandomSampler : public Sampler { - public: - virtual ~RandomSampler() {} - virtual void build(GraphEdgeBlob *edges); - virtual std::vector sample_k(int k); - GraphEdgeBlob *edges; -}; - -class WeightedSampler : public Sampler { - public: - WeightedSampler(); - virtual ~WeightedSampler(); - WeightedSampler *left, *right; - float weight; - int count; - int idx; - GraphEdgeBlob *edges; - virtual void build(GraphEdgeBlob *edges); - virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); - virtual std::vector sample_k(int k); - - private: - int sample(float query_weight, - std::unordered_map &subtract_weight_map, - std::unordered_map &subtract_count_map, - float &subtract); -}; -} -} diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 81a1ff5eced2bb36b8f917a31de1e214b272bfa3..55fc92c9b57859772e05ebee0f0cb084ddcfa04a 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -36,7 +36,7 @@ class Table { Table() {} virtual ~Table() {} virtual int32_t initialize(const TableParameter &config, - const FsClientParameter &fs_config) final; + const FsClientParameter &fs_config); virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; @@ -58,7 +58,9 @@ class Table { virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float **values, - size_t num){}; + size_t num) { + return 0; + } virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -108,7 +110,7 @@ class Table { virtual int32_t save(const 
std::string &path, const std::string &converter) = 0; - virtual int32_t set_shard(size_t shard_idx, size_t shard_num) final { + virtual int32_t set_shard(size_t shard_idx, size_t shard_num) { _shard_idx = shard_idx; _shard_num = shard_num; return initialize_shard(); @@ -123,7 +125,7 @@ class Table { protected: virtual int32_t initialize() = 0; - virtual int32_t initialize_accessor() final; + virtual int32_t initialize_accessor(); virtual int32_t initialize_shard() = 0; virtual std::string table_dir(const std::string &model_dir) { return paddle::string::format_string("%s/%03d/", model_dir.c_str(), diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index b268bb449e14619048e89c8933dbae7daf66537b..b8630aed02ffe60181ddb6b41810f5bea602b733 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -124,7 +124,6 @@ void testSingleSampleNeighboor( for (auto g : s) { ASSERT_EQ(true, s1.find(g) != s1.end()); } - VLOG(0) << "test single done"; s.clear(); s1.clear(); vs.clear(); @@ -141,6 +140,57 @@ void testSingleSampleNeighboor( } } +void testAddNode( + std::shared_ptr& worker_ptr_) { + worker_ptr_->clear_nodes(0); + int total_num = 270000; + uint64_t id; + std::unordered_set id_set; + for (int i = 0; i < total_num; i++) { + while (id_set.find(id = rand()) != id_set.end()) + ; + id_set.insert(id); + } + std::vector id_list(id_set.begin(), id_set.end()); + std::vector weight_list; + auto status = worker_ptr_->add_graph_node(0, id_list, weight_list); + status.wait(); + std::vector ids[2]; + for (int i = 0; i < 2; i++) { + auto sample_status = + worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); + sample_status.wait(); + } + std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); + for (auto x : ids[1]) id_set_check.insert(x); + ASSERT_EQ(id_set.size(), id_set_check.size()); + for (auto x : id_set) { + ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); + } + std::vector remove_ids; + for (auto p : id_set_check) { + if (remove_ids.size() == 0) + remove_ids.push_back(p); + else if (remove_ids.size() < total_num / 2 && rand() % 2 == 1) { + remove_ids.push_back(p); + } + } + for (auto p : remove_ids) id_set_check.erase(p); + status = worker_ptr_->remove_graph_node(0, remove_ids); + status.wait(); + for (int i = 0; i < 2; i++) ids[i].clear(); + for (int i = 0; i < 2; i++) { + auto sample_status = + worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); + sample_status.wait(); + } + std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); + for (auto x : ids[1]) id_set_check1.insert(x); + ASSERT_EQ(id_set_check1.size(), id_set_check.size()); + for (auto x : id_set_check1) { + ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); + } +} void testBatchSampleNeighboor( std::shared_ptr& worker_ptr_) { std::vector>> vs; @@ -527,6 +577,7 @@ void RunBrpcPushSparse() { std::remove(edge_file_name); std::remove(node_file_name); + testAddNode(worker_ptr_); LOG(INFO) << "Run stop_server"; worker_ptr_->stop_server(); LOG(INFO) << "Run finalize_worker"; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 2e9c81caf53a4f643c93909008412b80a892bfa8..0b583931054b161ce09a9072359935cb5ccfd4c9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(fleet) add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) 
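The RandomSampler::sample_k deleted in the graph_weighted_sampler.cc hunk above draws k distinct neighbor indices without building a full permutation: each draw records a "virtual swap" of the chosen index with the current last index in a small map, which is a partial Fisher-Yates shuffle in O(k) memory, while the WeightedSampler layers a binary tree of subtree weights on top so that each weighted draw without replacement stays logarithmic. A standalone sketch of the uniform variant follows; the function name is illustrative and std::mt19937 stands in for the clock-seeded rand() used in the deleted code.

#include <random>
#include <unordered_map>
#include <vector>

// Draw k distinct indices from {0, ..., n-1}. The replace map plays the role
// of an explicit array in Fisher-Yates: slot r "becomes" whatever currently
// sits in the last slot, and the range shrinks by one after every draw.
std::vector<int> SampleKWithoutReplacement(int n, int k, std::mt19937* rng) {
  if (k > n) k = n;
  std::vector<int> result;
  std::unordered_map<int, int> replace;  // index -> value virtually swapped in
  while (k--) {
    int r = std::uniform_int_distribution<int>(0, n - 1)(*rng);
    auto hit = replace.find(r);
    result.push_back(hit == replace.end() ? r : hit->second);
    // Move the (virtual) last element into slot r, then shrink the range.
    auto last = replace.find(n - 1);
    replace[r] = (last == replace.end()) ? n - 1 : last->second;
    --n;
  }
  return result;
}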
+proto_library(op_def_proto SRCS op_def.proto) proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto @@ -287,6 +288,15 @@ if(WITH_DISTRIBUTE) graph_to_program_pass variable_helper timer monitor) endif() elseif(WITH_PSLIB) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 8ff94b0277c0cb894ec5c324e0bee962004bb6ee..8708d90485af8fffab7a5c04d3c132e1ced82364 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -143,7 +143,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, const Tensor& in, Tensor* out, - platform::Place place) { + platform::Place place, bool always_copy) { PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Input tensor format is invalid. Input tensor should " @@ -177,7 +177,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - if (in_format != out_format) { + if ((in_format != out_format) || always_copy) { void* in_data = GetDataFromTensor(in, in_type); std::string key = platform::CreateKey(*dev_ctx, in_tz, in_format, out_format, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 238f2d2e67914c7ae1443d09cf915439ebad4dd5..3404ba2db67e5f0e90203d7ee0bb238bb377af0f 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -78,7 +78,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, const Tensor& in, Tensor* out, - platform::Place place); + platform::Place place, + bool always_copy = false); void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index c8f73a5469ab32a5734d980010a52a6f72eb6ca8..648a32420aa6c0d12545c72ff8cec778d817d7e9 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/eigen_ext.h" @@ -30,6 +31,8 @@ struct bfloat16; struct complex128; struct complex64; struct float16; +template +struct complex; } // namespace platform } // namespace paddle @@ -61,6 +64,10 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ _ForEachDataTypeHelper_(callback, int16_t, INT16); \ _ForEachDataTypeHelper_(callback, int8_t, INT8); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); @@ -69,6 +76,10 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, double, FP64); \ _ForEachDataTypeHelper_(callback, int, INT32); \ _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 0fdb97db20af992998d94e37263f415a84cd1ba1..829772448eb91e428224647168029395d95ab9f6 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -163,6 +163,11 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, omp_in) #pragma omp declare reduction(+ : paddle::platform::complex128 : omp_out += \ omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex < \ + float > : omp_out += omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex < \ + double > : omp_out += omp_in) + #endif template @@ -268,12 +273,69 @@ void CheckNanInf( op_type)); } } + +template <> +void CheckNanInf>( + const paddle::platform::complex* value, const size_t numel, + int print_num, const std::string& op_type, const std::string& var_name) { + float real_sum = 0.0f; +#pragma omp parallel for reduction(+ : real_sum) + for (size_t i = 0; i < numel; ++i) { + real_sum += (value[i].real - value[i].real); + } + + float imag_sum = 0.0f; +#pragma omp parallel for reduction(+ : imag_sum) + for (size_t i = 0; i < numel; ++i) { + imag_sum += (value[i].imag - value[i].imag); + } + + if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || + std::isinf(imag_sum)) { + // hot fix for compile failed in gcc4.8 + // here also need print detail info of nan or inf later + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name, + op_type)); + } +} + +template <> + void CheckNanInf>> + (const paddle::platform::complex* value, const size_t numel, + int print_num, const std::string& op_type, const std::string& var_name) { + double real_sum = 0.0; +#pragma omp parallel for reduction(+ : real_sum) + for (size_t i = 0; i < numel; ++i) { + real_sum += (value[i].real - value[i].real); + } + + double imag_sum = 0.0; +#pragma omp parallel for reduction(+ : imag_sum) 
+ for (size_t i = 0; i < numel; ++i) { + imag_sum += (value[i].imag - value[i].imag); + } + + if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || + std::isinf(imag_sum)) { + // hot fix for compile failed in gcc4.8 + // here also need print detail info of nan or inf later + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name, + op_type)); + } +} + #endif template <> template void TensorCheckerVisitor::apply( - typename std::enable_if::value>::type*) const { + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { // use env strategy control in future, -1=print_all. int print_num = 3; CheckNanInf(tensor_.data(), tensor_.numel(), print_num, op_type_, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 96d1a9fb94927debf8525fdc8b9597f08eeb7129..a9ea336e42545720df3f7226dac51531b26ebfff 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -123,7 +123,11 @@ __global__ void CheckNanInfKernel(const T* value, const size_t numel, template <> template void TensorCheckerVisitor::apply( - typename std::enable_if::value>::type*) const { + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { int print_num = 3; auto* dev_ctx = reinterpret_cast( diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index b4459e5a7c1cc6ad6faa9e19f39bff47fe128344..10b7ab0bc9c534faee7be0a20182ad96c4550844 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -46,8 +46,12 @@ struct TensorCheckerVisitor { } template - void apply(typename std::enable_if::value>::type* = - 0) const; + void apply( + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type* = + 0) const; std::string op_type_; std::string var_name_; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 84369011476c77765dc5396830adc34f775fbb50..db83cd55889c43feadaab2dd4170b5e90d117435 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -195,6 +195,9 @@ class DeviceWorker { virtual void SetReaderPlace(const paddle::platform::Place& place) { device_reader_->SetPlace(place); } + virtual void SetDeviceContext(platform::DeviceContext* dev_ctx) { + dev_ctx_ = dev_ctx; + } virtual Scope* GetThreadScope() { return thread_scope_; } DataFeed* device_reader_ = nullptr; @@ -221,6 +224,7 @@ class DeviceWorker { int dump_mode_ = 0; int dump_interval_ = 10000; ChannelWriter writer_; + platform::DeviceContext* dev_ctx_ = nullptr; }; class CPUWorkerBase : public DeviceWorker { @@ -266,9 +270,6 @@ class HogwildWorker : public CPUWorkerBase { HogwildWorkerParameter param_; std::vector skip_ops_; std::map stat_var_name_map_; -#ifdef PADDLE_WITH_HETERPS - platform::DeviceContext* dev_ctx_ = nullptr; -#endif }; class DownpourWorker : public HogwildWorker { @@ -622,7 +623,6 @@ class PSGPUWorker : public HogwildWorker { gpuStream_t copy_stream_; int batch_cnt_{0}; std::atomic done_cnt_{0}; - platform::DeviceContext* dev_ctx_ = nullptr; double total_time_; double read_time_; diff --git 
a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index d102fcdbe0cec14144d54f82e62760e8a1ceaec2..181e3b68853801460c87162badb553c90ab7ccb5 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -141,6 +141,7 @@ message PipelineConfig { message TensorParallelConfig { optional int32 tensor_parallel_degree = 1 [ default = 1 ]; + optional int32 tensor_init_seed = 2 [ default = -1 ]; } message DistributedStrategy { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 3833b027d2a364d7a46d01540983a1637de25376..54d8fc92b29459ec062b6809ef4cd5156d50c21a 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -28,9 +28,19 @@ namespace internal { template static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; - if (std::is_same::value || - std::is_same::value || - std::is_floating_point::value) { + if (std::is_same>::value || + std::is_same>::value || + std::is_same::value || + std::is_same::value) { + // The current dlpack library version is v0.2, and does not define + // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set + // dtype.code to 5U directly here. After the dlpack library version being + // upgraded to v0.4, it should be written as follow. + // dtype.code = kDLComplex; + dtype.code = 5U; + } else if (std::is_same::value || + std::is_same::value || + std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { dtype.code = kDLUInt; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index d03437034d62ad0e4249a96d71f5f7544647e704..1a79ada0be7c620eab3e64e8ba600f557af6d39e 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -28,6 +28,13 @@ namespace framework { namespace { // NOLINT template constexpr uint8_t GetDLDataTypeCode() { + if (std::is_same>::value || + std::is_same>::value || + std::is_same::value || + std::is_same::value) { + return static_cast(5); + } + return std::is_same::value || std::is_floating_point::value ? 
static_cast(kDLFloat) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index b2d170888e28fc4e9918c26f000a5983c09811ee..0c66622ed7b9a6a6e9fb5112001009c2b95e367a 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -39,9 +39,6 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { for (int i = 0; i < param_.stat_var_names_size(); ++i) { stat_var_name_map_[param_.stat_var_names(i)] = 1; } -#ifdef PADDLE_WITH_HETERPS - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); -#endif } void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index ab69170322ce3ec4eaa8e46b53e490b634df64b7..7e7f1fed5ad58db25909c25ca60f5eac80a5f478 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -50,8 +50,9 @@ if (WITH_TESTING) endif(WITH_TESTING) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) +cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) -cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) +cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) @@ -139,6 +140,7 @@ cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) +cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) cc_test(test_fc_fuse_pass_cc SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_lstm_fuse_pass_cc SRCS fc_lstm_fuse_pass_tester.cc DEPS fc_lstm_fuse_pass framework_proto) cc_test(test_fc_gru_fuse_pass_cc SRCS fc_gru_fuse_pass_tester.cc DEPS fc_gru_fuse_pass framework_proto) diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index ce7635bb35ce6108b4a5a356c8fb99269dbf2890..bc5fc2a16d3939648f53e91f6cd3f4f0def0fd93 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/scope.h" namespace paddle { @@ -46,7 +46,7 @@ enum FuseOptions { FUSE_MKLDNN // fusing will be done with MKL-DNN }; -class FusePassBase : public Pass { +class FusePassBase : public OpCompatSensiblePass { public: void Init(const std::string& repr, Graph* graph) const; Scope* param_scope() const; diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b056c3b07a2f65bf0756285857edd3355b591c29 --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -0,0 +1,231 @@ +/* Copyright (c) 
2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" +#include "paddle/fluid/framework/op_info.h" +namespace paddle { +namespace framework { +namespace ir { + +AttrCompat& AttrCompat::IsStringIn(const std::set& candidates) { + conditions_.emplace_back([candidates](const Attribute& attr) -> bool { + std::string value = BOOST_GET_CONST(std::string, attr); + for (auto& str : candidates) { + if (str == value) { + return true; + } + } + return false; + }); + return *this; +} + +AttrCompat& AttrCompat::IsStringMatch( + const std::function& func) { + conditions_.emplace_back([func](const Attribute& attr) -> bool { + std::string value = BOOST_GET_CONST(std::string, attr); + return func(value); + }); + return *this; +} + +AttrCompat& AttrCompat::IsIntIn(const std::set& candidates) { + conditions_.emplace_back([candidates](const Attribute& attr) -> bool { + int value = BOOST_GET_CONST(int, attr); + return candidates.find(value) != candidates.end(); + }); + return *this; +} + +//! Todo: append the definition. +AttrCompat& AttrCompat::IsLeftDefault() { + const std::string& op_name = op_compat_->Name(); + if (!OpInfoMap::Instance().Has(op_name)) { + VLOG(3) << "Op (" << op_name << ") is not registered!"; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + return *this; + } + const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); + const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap(); + if (attrs.find(attr_name_) == attrs.end()) { + VLOG(3) << "Op (" << op_name << ") has no default attr:" << attr_name_; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + } else { + Attribute default_attr = attrs.at(attr_name_); + conditions_.emplace_back([default_attr](const Attribute& attr) -> bool { + return attr == default_attr; + }); + } + return *this; +} + +bool AttrCompat::operator()(const OpDesc& op_desc) { + if (conditions_.empty()) { + return true; + } + if (!op_desc.HasAttr(attr_name_)) { + return optional_; + } + const Attribute attr = op_desc.GetAttr(attr_name_); + for (auto& func : conditions_) { + if (!func(attr)) { + return false; + } + } + return true; +} +AttrCompat& AttrCompat::IsOptional() { + optional_ = true; + return *this; +} + +AttrCompat& AttrCompat::IsBoolEQ(bool v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + bool value = BOOST_GET_CONST(bool, attr); + return value == v; + }); + return *this; +} + +InputOrOutputCompat& InputOrOutputCompat::IsTensor() { + conditions_.emplace_back([](const std::vector& input) -> bool { + return input.size() == 1u; + }); + return *this; +} + +InputOrOutputCompat& InputOrOutputCompat::IsOptional() { + optional_ = true; + return *this; +} + +bool InputOrOutputCompat::operator()( + const std::vector& input) const { + if (input.empty()) return false; + for (auto& func : conditions_) { + if (!func(input)) { + return false; + } + } + return true; +} + +AttrCompat& 
OpCompat::AddAttr(const std::string& attr_name) { + PADDLE_ENFORCE_EQ( + attr_compats_.find(attr_name), attr_compats_.end(), + platform::errors::InvalidArgument( + "The attrubute compat with the same name has been added")); + attr_compats_.emplace(attr_name, AttrCompat(attr_name, this)); + return attr_compats_.at(attr_name); +} + +InputOrOutputCompat& OpCompat::AddInput(const std::string& name) { + PADDLE_ENFORCE_EQ(input_compats_.find(name), input_compats_.end(), + platform::errors::InvalidArgument( + "The input with the same name has been added")); + input_compats_.emplace(name, InputOrOutputCompat(name, this)); + return input_compats_.at(name); +} + +InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { + PADDLE_ENFORCE_EQ(output_compats_.find(name), output_compats_.end(), + platform::errors::InvalidArgument( + "The output with the same name has been added")); + output_compats_.emplace(name, InputOrOutputCompat(name, this)); + return output_compats_.at(name); +} + +bool OpCompat::Judge(const OpDesc& op_desc) { + for (auto& attr_map : op_desc.GetAttrMap()) { + if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { + if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { + VLOG(3) << "The Attr(" << attr_map.first << ") of Op (" << op_name_ + << ") not reigistered in OpCompat, not equal to default value!"; + return false; + } + } + } + for (auto& attr_compat : attr_compats_) { + if (!attr_compat.second(op_desc)) { + VLOG(3) << " Check the Attr(" << attr_compat.first << ") of Op(" + << op_name_ << ") failed!"; + return false; + } + } + + const VariableNameMap& inputs_map = op_desc.Inputs(); + for (auto& input_desc : inputs_map) { + if (input_compats_.find(input_desc.first) == input_compats_.end()) { + if (!input_desc.second.empty()) { + VLOG(3) << "The Input (" << input_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; + return false; + } + } + } + for (auto& input_val : input_compats_) { + if (inputs_map.find(input_val.first) == inputs_map.end()) { + if (!input_val.second.Optional()) { + VLOG(3) << "The No optional Input (" << input_val.first + << ") of Operator (" << op_name_ << ") not find in op_desc!"; + return false; + } + } else { + if (!input_val.second(inputs_map.at(input_val.first))) { + VLOG(3) << "The Input (" << input_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; + return false; + } + } + } + + const VariableNameMap& outputs_map = op_desc.Outputs(); + for (auto& output_desc : outputs_map) { + if (output_compats_.find(output_desc.first) == output_compats_.end()) { + if (!output_desc.second.empty()) { + VLOG(3) << "The Output (" << output_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; + return false; + } + } + } + for (auto& output_val : output_compats_) { + if (outputs_map.find(output_val.first) == outputs_map.end()) { + if (!output_val.second.Optional()) { + VLOG(3) << "The No optional Output (" << output_val.first + << ") of Operator (" << op_name_ << ") not find in op_desc!"; + return false; + } + } else { + if (!output_val.second(outputs_map.at(output_val.first))) { + VLOG(3) << "The Output (" << output_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; + return false; + } + } + } + return true; +} + +OpCompat& OpCompatSensiblePass::AddOpCompat(OpCompat&& op_compat) { + std::string name = op_compat.Name(); + op_compat_judgers_[name].reset(new OpCompat(std::move(op_compat))); + return *(op_compat_judgers_[name]); +} + +} // 
namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3f2ea673d879b8f1ca3ddbed82b6120af5044d47 --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class OpCompat; + +class AttrCompat { + public: + AttrCompat(const std::string& attr_name, OpCompat* op_compat) + : optional_(false), attr_name_(attr_name), op_compat_(op_compat) {} + + // @{ String-related methods + //! Assert the attribute is an string in the `candidates` domain. + AttrCompat& IsStringIn(const std::set& candidates); + //! Assert the attribute is a string and match a custom judging function. + AttrCompat& IsStringMatch( + const std::function& func); + // @} + + //! Assert the attribute is an integer in the `candidates` domain. + AttrCompat& IsIntIn(const std::set& candidates); + + // @{ Number-releated methods + //! Assert the attribute is a number and > `v`. + template + AttrCompat& IsNumGT(T v); + //! Assert the attribute is a number and >= `v`. + template + AttrCompat& IsNumGE(T v); + //! Assert the attribute is a number and < `v`. + template + AttrCompat& IsNumLT(T v); + //! Assert the attribute is a number and <= `v`. + template + AttrCompat& IsNumLE(T v); + //! Assert the attribute is a number and == `v`. + template + AttrCompat& IsNumEQ(T v); + //! Assert the attribute is a number and matches a customized judging + //! function. + template + AttrCompat& IsNumMatch(bool (*func)(T)); + // @} + + //! Assert the attribute is a boolean value equals `v`. + AttrCompat& IsBoolEQ(bool v); + + //! Tell whether this attribute is left as default value. + AttrCompat& IsLeftDefault(); + + AttrCompat& IsOptional(); + + //! Jump back to retrieve OpCompat instance. + OpCompat& End() { return *op_compat_; } + + bool operator()(const OpDesc& op_desc); + + private: + bool optional_; + std::string attr_name_; + OpCompat* op_compat_; + std::vector> conditions_; +}; + +class InputOrOutputCompat { + public: + InputOrOutputCompat(const std::string& name, OpCompat* op_compat) + : optional_(false), name_(name), op_compat_(op_compat) {} + + InputOrOutputCompat& IsTensor(); + InputOrOutputCompat& IsOptional(); + bool Optional() const { return optional_; } + bool operator()(const std::vector& input) const; + + //! Jump back to retrieve OpCompat instance. 
+ OpCompat& End() { return *op_compat_; } + + private: + bool optional_; + std::string name_; + OpCompat* op_compat_; + std::vector&)>> conditions_; +}; + +/** + * OpCompat is a helper class to help define the compatible Op definition. + * + * Usage: + * OpCompat compat("FC"); + * compat.AddAttr("in_num_col_dims").IsNumLE(1).End() + * .AddAttr("activation_type").IsStringIn({"tanh", "sigmoid"}).End() + * .AddInput("Input").IsTensor().End() + * .AddInput("W").IsTensor().End() + * .AddInput("Bias").IsTensor().IsOptional().End() + * .AddOutput("Out").IsTensor().End() + * + * All the inference-aware Op defition is as above, all the other attributes not + * contained in the definition should be set default value or it would be judged + * incompatible. + */ +class OpCompat { + public: + explicit OpCompat(const std::string& op_name) : op_name_(op_name) {} + explicit OpCompat(std::string&& op_name) : op_name_(std::move(op_name)) {} + explicit OpCompat(const OpCompat&) = default; + explicit OpCompat(OpCompat&&) = default; + + AttrCompat& AddAttr(const std::string& attr_name); + InputOrOutputCompat& AddInput(const std::string& name); + InputOrOutputCompat& AddOutput(const std::string& name); + + //! Judge whether an OpDesc match the defined Op compatibility. + bool Judge(const OpDesc& op_desc); + const std::string& Name() const { return op_name_; } + + private: + std::string op_name_; + std::unordered_map attr_compats_; + std::unordered_map input_compats_; + std::unordered_map output_compats_; +}; + +/** + * OpCompatSensiblePass is a base class for all the passes thouse is sensitive + * to Op update. + * There are two methods to help tell the compability of an Op + * bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, Graph* g); + * bool IsCompat(const OpDesc& op_desc); + * + * One can register the related Op compabilities using + * void AddOpCompat(OpCompat&& judger); + * + * Most of the Passes are used for fusing ops, so we define a method for such + * scenerios. + * void AccessSubgraph(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g); + * It will check the Op compatibility automatically. + * For other scenirios, one should call `IsCompat` by himself. + * + * A FC fuse pass example: + * class FcFusePass : public OpCompatSensiblePass { + * public: + * FcFusePass() { + * // define Mul op compatiblity. + * AddOpCompat(OpCompat("Mul")) + * .AddInput("Input").IsTensor().End() + * .AddAttr("in_num_col_dims").IsNumGE(1); + * AddOpCompat(OpCompat("Add")). ...; + * // There are multiple activation implemention. + * AddOpCompat(OpCompat("Tanh")). ...; + * AddOpCompat(OpCompat("Sigmoid")). ...; + * } + * + * // override the subgraph access method + * virtual bool AccessSubgraphImpl( + * const GraphPatternDetector::subgraph_t& subgraph, + * Graph* g) override { ... } + * + * // Call the AccessSubgraph method in main procedure of this Pass. + * }; + */ +class OpCompatSensiblePass : public Pass { + protected: + /** + * Developer should push the compatibility `teller` for each kind of Op in the + * subgraph. + * NOTE One should add all the related op compatiblity in the construct so + * that all the following methods are valid. + */ + OpCompat& AddOpCompat(OpCompat&& op_compat); + + //! Tell the Op compability of a subgraph. 
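The builder style used by AttrCompat above works by accumulation: every Is*() call appends a std::function predicate to conditions_ and returns *this so calls chain, and operator()(const OpDesc&) only succeeds if every accumulated predicate holds. A trimmed, renamed standalone analogue of that design is sketched below (StringAttrChecker is illustrative and checks a plain string rather than a framework Attribute).

#include <functional>
#include <set>
#include <string>
#include <vector>

// Each Is*()/Matches() call appends one predicate; evaluation ANDs them all.
class StringAttrChecker {
 public:
  StringAttrChecker& IsIn(std::set<std::string> candidates) {
    conditions_.emplace_back([candidates](const std::string& v) {
      return candidates.count(v) > 0;
    });
    return *this;  // returning *this is what makes the calls chainable
  }
  StringAttrChecker& Matches(std::function<bool(const std::string&)> func) {
    conditions_.emplace_back(std::move(func));
    return *this;
  }
  bool operator()(const std::string& value) const {
    for (const auto& cond : conditions_) {
      if (!cond(value)) return false;
    }
    return true;
  }

 private:
  std::vector<std::function<bool(const std::string&)>> conditions_;
};

// Usage sketch:
//   StringAttrChecker act;
//   act.IsIn({"tanh", "sigmoid"}).Matches([](const std::string& s) { return s != "sigmoid"; });
//   bool ok = act("tanh");  // true: in the set and passes the custom match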
+ bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) const { + CHECK(!op_compat_judgers_.empty()) + << "At least one OpCompat instance should be added in the " + "OpCompatSensiblePass."; + // Check the all the ops in the subgraph are contained in the + // op_compat. + for (auto& node_pair : subgraph) { + if (!node_pair.second->IsOp()) continue; + auto op_type = node_pair.second->Op()->Type(); + if (!op_compat_judgers_.count(op_type)) { + return false; + } + auto& judger = *op_compat_judgers_.at(op_type); + if (!judger.Judge(*(node_pair.second->Op()))) { + return false; + } + } + return true; + } + + //! Tell the op compatibility of a single Op. + bool IsCompat(const OpDesc& op_desc) const { + if (!op_compat_judgers_.count(op_desc.Type())) return false; + return op_compat_judgers_.at(op_desc.Type())->Judge(op_desc); + } + + private: + std::map> op_compat_judgers_; +}; + +template +AttrCompat& AttrCompat::IsNumGT(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value > v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumGE(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value >= v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumLT(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value < v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumLE(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value <= v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumEQ(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value == v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumMatch(bool (*func)(T)) { + conditions_.emplace_back([func](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return func(value); + }); + return *this; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..0878e4d9890d35bc4ecdf276880b43e9c5f4f416 --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(OpCompatSensiblePass, compatOp) { + auto lambda = [](const std::string& str) { return str == "tanh"; }; + OpCompat compat("fc"); + compat.AddAttr("in_num_col_dims") + .IsIntIn({1, 2}) + .IsNumLE(1) + .IsLeftDefault() + .End() + .AddAttr("activation_type") + .IsStringIn({"tanh", "sigmoid"}) + .IsStringMatch(lambda) + .End() + .AddAttr("test_attr") + .IsBoolEQ(true) + .End() + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("Test") + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + OpDesc fc_op; + + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + attr_map["activation_type"] = std::string("tanh"); + attr_map["test_attr"] = true; + + fc_op.SetAttrMap(attr_map); + + fc_op.SetInput("Input", std::vector{"test_input"}); + fc_op.SetInput("W", std::vector{"test_input_0"}); + fc_op.SetInput("Bias", std::vector{"test_input_1"}); + fc_op.SetOutput("Out", std::vector{"test_output"}); + + EXPECT_STREQ(compat.Name().c_str(), "fc"); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOpAttribute) { + OpCompat compat("fc"); + + OpDesc fc_op; + + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + fc_op.SetAttrMap(attr_map); + + OpInfo info; + info.checker_ = new OpAttrChecker(); + OpInfoMap::Instance().Insert("fc", info); + + EXPECT_FALSE(compat.Judge(fc_op)); + + info.checker_->AddAttrChecker("in_num_col_dims").SetDefault(1); + + EXPECT_TRUE(compat.Judge(fc_op)); + delete info.checker_; +} + +TEST(OpCompatSensiblePass, compatOpAttributeOptional) { + OpCompat compat("fc"); + compat.AddAttr("activation_type") + .IsOptional() + .IsStringIn({"tanh", "sigmoid"}); + OpDesc fc_op; + EXPECT_TRUE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOpInput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetInput("Input", std::vector{"test_input"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddInput("Input").IsTensor().End().AddInput("Bias").IsTensor().End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetInput("Bias", std::vector{"test_input", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOutput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetOutput("Output", std::vector{"test_output"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddOutput("Output") + .IsTensor() + .End() + .AddOutput("Output_2") + .IsTensor() + .End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetOutput("Output_2", std::vector{"test_output", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +class OpCompatSensiblePassTest : public OpCompatSensiblePass { + public: + OpCompatSensiblePassTest(); + bool TestIsCompat(const OpDesc& op_desc) { return IsCompat(op_desc); } +}; + +OpCompatSensiblePassTest::OpCompatSensiblePassTest() { + AddOpCompat(OpCompat("fc")) + .AddAttr("in_num_col_dims") + .IsNumLE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"tanh", "sigmoid"}) + .End() + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor(); +} + +TEST(OpCompatSensiblePass, IsCompat) { + OpCompatSensiblePassTest test; + 
OpDesc fc_op; + fc_op.SetType("fc"); + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + attr_map["activation_type"] = std::string("tanh"); + + fc_op.SetAttrMap(attr_map); + fc_op.SetInput("Input", std::vector{"test_input"}); + fc_op.SetInput("W", std::vector{"test_input_0"}); + fc_op.SetInput("Bias", std::vector{"test_input_1"}); + fc_op.SetOutput("Out", std::vector{"test_output"}); + + EXPECT_TRUE(test.TestIsCompat(fc_op)); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 7afa76c3fbd23a395e6769d83e939e0d36424471..c0ccc196348a5761ea4dedf1aab5ce8754eb74b5 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -112,6 +112,8 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, #ifdef PADDLE_WITH_HETERPS workers_[i]->SetPlace(places_[i]); workers_[i]->SetReaderPlace(places_[i]); + workers_[i]->SetDeviceContext( + platform::DeviceContextPool::Instance().Get(places_[i])); #else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); diff --git a/paddle/fluid/framework/op_def.proto b/paddle/fluid/framework/op_def.proto new file mode 100644 index 0000000000000000000000000000000000000000..7c4b42b1344b8b236078de694b67e05d983ed2a9 --- /dev/null +++ b/paddle/fluid/framework/op_def.proto @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; + +import "framework.proto"; +package paddle.framework.proto; + +message OpDef { + + message VarDef { + required string name = 1; + + // For the type of input / output variables. + reserved 2; + } + + message AttrDef { + required string name = 1; + required AttrType type = 2; + } + + message Desc { + repeated VarDef inputs = 1; + repeated VarDef outputs = 2; + repeated AttrDef attrs = 3; + } + + required string type = 1; + required Desc def = 2; + optional Desc extra = 3; +} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e2ced316ae9c265377d9c3273287a89aa837945c..098c576e85d1922e0d96ec12f83c959dc24d6cb9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1229,6 +1229,8 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, // will be executed and a warning will be given at the same time. 
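Once op_def.proto above is compiled by proto_library(op_def_proto ...), the OpDef message can be filled through the usual protoc-generated C++ accessors. A minimal sketch follows; the generated header path and the "my_relu" op are assumptions for illustration, and attributes would be added the same way via add_attrs() with an AttrType value taken from framework.proto.

#include "paddle/fluid/framework/op_def.pb.h"  // assumed output of proto_library(op_def_proto ...)

// Describe a hypothetical "my_relu" op: one input, one output, no extra section.
paddle::framework::proto::OpDef MakeMyReluDef() {
  paddle::framework::proto::OpDef op_def;
  op_def.set_type("my_relu");
  op_def.mutable_def()->add_inputs()->set_name("X");
  op_def.mutable_def()->add_outputs()->set_name("Out");
  return op_def;
}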
if (SupportGPU()) { expected_kernel_key.place_ = dev_ctx->GetPlace(); + } else if (SupportNPU()) { + expected_kernel_key.place_ = dev_ctx->GetPlace(); } else { expected_kernel_key.place_ = platform::CPUPlace(); LOG_FIRST_N(WARNING, 1) @@ -1300,7 +1302,11 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); - original_tensor->Resize(original_dims); + // In order to solve the problem that the output latitude of NPU reshape + // operator is not changed when inplace. + if (type_ != "reshape2" && type_ != "reshape2_grad") { + original_tensor->Resize(original_dims); + } } } @@ -1550,10 +1556,10 @@ void OperatorWithKernel::ParseInputDataType( } else if (var->IsType()) { t = &(var->Get().value()); } else if (var->IsType()) { - auto t_arr = var->Get(); - for (size_t j = 0; j < t_arr.size(); j++) { - if (t_arr[j].IsInitialized()) { - t = &(t_arr[j]); + auto t_arr = &var->Get(); + for (size_t j = 0; j < t_arr->size(); j++) { + if (t_arr->at(j).IsInitialized()) { + t = &(t_arr->at(j)); } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2128626a05c93aff6fc35a9fb974940486936f8f..7a4ad9bd1824e91fe69eee7ce629039d07f3db1d 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -155,6 +155,7 @@ class OperatorBase { std::string DebugString() const { return DebugStringEx(nullptr); } virtual bool SupportGPU() const { return false; } + virtual bool SupportNPU() const { return false; } const std::string& Type() const { return type_; } @@ -491,6 +492,13 @@ class OperatorWithKernel : public OperatorBase { return platform::is_gpu_place(kern_pair.first.place_); }); } + bool SupportNPU() const override { + auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); + return std::any_of(op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_npu_place(kern_pair.first.place_); + }); + } bool SupportsMKLDNN(proto::VarType::Type data_type) const; bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 00ff50abadd185eb6ac8907d89aaf455dd5a7f16..993b9ac52c5b561fa2c0c850845425ac19cc97bf 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -110,8 +110,22 @@ void SectionWorker::TrainFiles() { BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); } } +#elif defined(PADDLE_WITH_ASCEND_CL) + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. 
+ VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } #endif - } + } // max_memory_size >= 0 if (schedule_mode_ == 0) { // F-then-B scheduler which runs Forward phase for all microbatches, diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 82c95ba2c95712d2ebe3aa80286689028febf3fe..c7d947c58039efa80d5b8336bc5db99cd89cee82 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -71,7 +71,7 @@ elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() -if(WITH_TESTING) +if(WITH_TESTING AND TEST test_api_impl) if(NOT APPLE) set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 89c8c7902bac9fd2e15a164f7e0dfd21945cf16e..1ec692d3d1df66d8c1df689d557b289fc2880b30 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -650,13 +650,6 @@ std::unique_ptr CreatePaddlePredictor< gflags.push_back("--cudnn_deterministic=True"); } - if (config.thread_local_stream_enabled()) { - gflags.push_back("--allocator_strategy=thread_local"); - process_level_allocator_enabled = false; - } else { - process_level_allocator_enabled = true; - } - // TODO(wilber): jetson tx2 may fail to run the model due to insufficient memory // under the native_best_fit strategy. Modify the default allocation strategy to // auto_growth. todo, find a more appropriate way to solve the problem. @@ -664,6 +657,15 @@ std::unique_ptr CreatePaddlePredictor< gflags.push_back("--allocator_strategy=auto_growth"); #endif + // TODO(Shixiaowei02): Add a mandatory scheme to use the thread local + // allocator when multi-stream is enabled. + if (config.thread_local_stream_enabled()) { + gflags.push_back("--allocator_strategy=thread_local"); + process_level_allocator_enabled = false; + } else { + process_level_allocator_enabled = true; + } + if (framework::InitGflags(gflags)) { VLOG(3) << "The following gpu analysis configurations only take effect " "for the first predictor: "; diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index f7dbfd39cd26e6af40d7536d76fd031bee5a331c..43306b79fabf60caedb6c6c54417d3b7c98ab127 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -161,8 +162,24 @@ void Tensor::CopyToCpu(T *data) { auto *t_data = tensor->data(); auto t_place = tensor->place(); + paddle::framework::Tensor out; + auto mem_allocation = std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); + out.ResetHolder(mem_allocation); + if (paddle::platform::is_cpu_place(t_place)) { +#ifdef PADDLE_WITH_MKLDNN + if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) + paddle::framework::innerTransDataLayoutFromMKLDNN( + tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), + *tensor, &out, paddle::platform::CPUPlace(), true); + else + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::DeviceContextPool &pool = diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 9244b9af0bbd6cfc392b1b940d81c04b0dd0cde9..e6a0ecf4aececcba012923f631b2dcfd8f69743d 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -52,11 +52,6 @@ class ActivationOpConverter : public OpConverter { engine_->GetITensor(op_desc.Input("X")[0]); auto op_pair = ops.find(op_type_); - if (op_pair == ops.end()) { - PADDLE_THROW(platform::errors::Fatal( - "Wrong activation op type, the trt do not support the %s act type.", - op_type_)); - } nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc index 813342c08483b7e9124929d3f00d8155d337e67e..eba67c3c098ca60b7608ecf6db50b46e233955a5 100644 --- a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -55,16 +55,6 @@ class AffineChannelOpConverter : public OpConverter { auto* bias_t = bias_v->GetMutable(); float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false); - auto data_layout = framework::StringToDataLayout( - BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); - - PADDLE_ENFORCE_EQ( - data_layout, framework::DataLayout::kNCHW, - platform::errors::InvalidArgument( - "TensorRT affine channel converter can only convert NCHW format. " - "Other format should be run in fluid mode. 
Report a bug on github " - "issue if you see this line.")); - // tensorrt scalend layer only support spatial dims >= 2, // so nhwc is not availabe (spatial dims == 0) const int channel_axis = engine_->with_dynamic_shape(); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 47f5cc97d39cdf785bdbcbc468714f3fe0357209..df2400854414c36f46393e70ec1095c347c05c6c 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -25,10 +25,6 @@ static bool CheckDims(const nvinfer1::Dims& dims_x, return false; } for (int i = 0; i < dims_x.nbDims; i++) { - // conservative judgment - if (dims_x.d[i] == -1 || dims_y.d[i] == -1) { - return false; - } if (dims_x.d[i] != dims_y.d[i]) { return false; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 54fc9492b7193e90245cce23538e34a4857cfe1f..9df3ec0445ad1c6c778f81a5e1489096c850c589 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -143,6 +143,19 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; +// strides > 1 is only supported by trt7.0 above +#if !IS_TRT_VERSION_GE(7000) + if (desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } + } +#endif } if (op_type == "pool2d") { @@ -225,6 +238,20 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Output").size() << " output."; return false; } + +// strides > 1 is only supported by trt7.0 above +#if !IS_TRT_VERSION_GE(7000) + if (desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } + } +#endif } if (op_type == "matmul") { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index f74cd671d6dca0cd52bb595f6ee1370b464d9e30..a5f075b8dc68c26a034a667f5bdb2a26c224c24c 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -176,7 +176,7 @@ if(NOT APPLE AND WITH_MKLML) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc) - if(NOT WIN32) + if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120) @@ -242,10 +242,10 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") 
inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) -if(NOT WIN32 AND NOT APPLE) +if(NOT WIN32 AND NOT APPLE AND TEST test_analyzer_ernie_large) set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") endif() -if (WIN32) +if (WIN32 AND TEST test_analyzer_ernie_large) set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200) endif() @@ -645,6 +645,10 @@ if(WITH_GPU) ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() +if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + return() +endif() + if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300) set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 5c431ce77dc76ae08c70cd54989f323a230d47f7..796425a132b0003ae055569c23b107bd80987f9f 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -164,9 +164,9 @@ REGISTER_OP_CPU_KERNEL( ops::AbsKernel, ops::AbsKernel, ops::AbsKernel, + paddle::platform::complex>, ops::AbsKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( abs_grad, ops::AbsGradKernel, @@ -174,9 +174,9 @@ REGISTER_OP_CPU_KERNEL( ops::AbsGradKernel, ops::AbsGradKernel, ops::AbsGradKernel, + paddle::platform::complex>, ops::AbsGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( abs_grad_grad, @@ -187,6 +187,6 @@ REGISTER_OP_CPU_KERNEL( ops::AbsDoubleGradKernel, ops::AbsDoubleGradKernel, + paddle::platform::complex>, ops::AbsDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index 97409e6cb1b17b8fc109e30dc78720b8d573f042..d03de7a45628a4cd4045e6fbc2965060b3486cfe 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -52,8 +52,9 @@ class AbsKernel std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - LaunchElementwiseCudaKernel>( - dev_ctx, ins, &outs, functor); + LaunchSameDimsElementwiseCudaKernel>(dev_ctx, ins, &outs, + functor); } }; @@ -69,8 +70,8 @@ REGISTER_OP_CUDA_KERNEL( ops::AbsKernel, ops::AbsKernel, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel); + ops::AbsKernel>, + ops::AbsKernel>); REGISTER_OP_CUDA_KERNEL( abs_grad, ops::AbsGradKernel, @@ -78,8 +79,8 @@ REGISTER_OP_CUDA_KERNEL( ops::AbsGradKernel, ops::AbsGradKernel, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel); + ops::AbsGradKernel>, + ops::AbsGradKernel>); REGISTER_OP_CUDA_KERNEL( abs_grad_grad, ops::AbsDoubleGradKernel, @@ -87,5 +88,5 @@ REGISTER_OP_CUDA_KERNEL( ops::AbsDoubleGradKernel, ops::AbsDoubleGradKernel, ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel); + ops::AbsDoubleGradKernel>, + ops::AbsDoubleGradKernel>); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 618f17031b1ef3b4b96ea72b05f9f63edd01c794..87e65e8817798199c907f88692887a21da58673c 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -13,6 +13,7 @@ limitations under the License. 
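// [Editorial sketch] The abs_op registrations above (and many hunks that
// follow, e.g. cast, dot, conj and the elementwise ops) all apply the same
// migration: the two concrete types paddle::platform::complex64 / complex128
// are replaced by a single class template paddle::platform::complex<T>
// instantiated with float and double. The shape of that change, shown with
// std::complex as a stand-in for the Paddle type:
#include <complex>
template <typename T>
struct Conjugate {
  // One template covers both precisions ...
  std::complex<T> operator()(const std::complex<T>& v) const {
    return std::complex<T>(v.real(), -v.imag());
  }
};
// ... instantiated twice, instead of two hand-written complex64/complex128 structs:
template struct Conjugate<float>;
template struct Conjugate<double>;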
*/ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { @@ -1315,8 +1316,8 @@ class ActivationCudaKernel for (auto& attr : attrs) { *attr.second = ctx.Attr(attr.first); } - LaunchElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } }; @@ -1345,16 +1346,16 @@ class ActivationGradCudaKernel if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { // Only need forward output Out ins.push_back(out); - LaunchElementwiseCudaKernel( + LaunchSameDimsElementwiseCudaKernel( dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == static_cast(kDepX)) { // Only need forward input X ins.push_back(x); - LaunchElementwiseCudaKernel( + LaunchSameDimsElementwiseCudaKernel( dev_ctx, ins, &outs, functor); } else { - LaunchElementwiseCudaKernel( + LaunchSameDimsElementwiseCudaKernel( dev_ctx, ins, &outs, functor); } } @@ -1437,9 +1438,9 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ +#ifdef PADDLE_WITH_HIP REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, CudaReluGradFunctor); - REGISTER_OP_CUDA_KERNEL( relu_grad_grad, ops::ActivationDoubleGradKernel>, ops::ActivationDoubleGradKernel>); +#else +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>); +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); +REGISTER_OP_CUDA_KERNEL( + relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +#endif /* ========================================================================== */ /* =========================== tanh register ============================ */ diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 40f4b969ec060d8453d176db67a6eb20933c6b3e..952e9ca329f102566d14cbf9180001e4ae5aef35 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -27,6 +27,9 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor of cast op"); AddAttr("out_dtype", "output data type"); AddAttr("in_dtype", "input data type"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Cast Operator. 
@@ -50,6 +53,7 @@ class CastOpGradMaker : public framework::SingleGradOpMaker { grad->SetOutput("Out", this->InputGrad("X")); grad->SetAttr("out_dtype", this->GetAttr("in_dtype")); grad->SetAttr("in_dtype", this->GetAttr("out_dtype")); + grad->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); } }; @@ -77,6 +81,28 @@ class CastOp : public framework::OperatorWithKernel { if (platform::is_cuda_pinned_place(tensor_place)) { return framework::OpKernelType(tensor->type(), ctx.device_context()); } + +#ifdef PADDLE_WITH_MKLDNN + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto MKLDNNSupportsCast = [&]() -> bool { + int dtype_fp32 = static_cast(framework::proto::VarType::FP32); + int dtype_bf16 = static_cast(framework::proto::VarType::BF16); + + if ((in_dtype != dtype_fp32 && in_dtype != dtype_bf16) || + (out_dtype != dtype_fp32 && out_dtype != dtype_bf16)) + return false; + + return true; + }; + + if (this->CanMKLDNNBeUsed(ctx, tensor->type()) && MKLDNNSupportsCast()) { + return framework::OpKernelType(tensor->type(), ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif return framework::OpKernelType(tensor->type(), tensor_place); } }; @@ -90,13 +116,11 @@ REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, ops::CastOpGradMaker, ops::CastOpProtoMaker); -REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel); +REGISTER_OP_CPU_KERNEL( + cast, ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 13759633d0168a4d38796a88fe8db215cfcfe380..1ac110b3cafd6bfd9da29daaebb65df570a02cb0 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -95,6 +95,7 @@ struct CastOpFunctor { namespace ops = paddle::operators; +#ifdef PADDLE_WITH_HIP REGISTER_OP_CUDA_KERNEL( cast, ops::CastOpKernel, ops::CastOpKernel, @@ -105,6 +106,23 @@ REGISTER_OP_CUDA_KERNEL( ops::CastOpKernel, ops::CastOpKernel, + paddle::platform::complex>, ops::CastOpKernel); + paddle::platform::complex>); +#else +REGISTER_OP_CUDA_KERNEL( + cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>); +#endif diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index 69f1f4681a33d68d9a4d0efa09bd33d01834cff6..52a23c50c0e115536c87e479ff1763c8d440d550 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -27,10 +27,11 @@ class CRecvOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto x = ctx.Output("Out"); - void* ptr = reinterpret_cast(const_cast(x->data())); - int numel = x->numel(); - HcclDataType dtype = platform::ToHCCLDataType(x->type()); + auto out = ctx.Output("Out"); + out->mutable_data(out->dims(), ctx.GetPlace()); + void* ptr = reinterpret_cast(const_cast(out->data())); + int numel = out->numel(); + 
HcclDataType dtype = platform::ToHCCLDataType(out->type()); int ring_id = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); @@ -54,8 +55,10 @@ class CRecvOpASCENDKernel : public framework::OpKernel { int root = peer; VLOG(3) << "begin hccl recv, parameter is: " - << "root " << root << ", comm: " << comm->comm() - << ", stream: " << stream; + << "ring_id:" << ring_id << ", nranks:" << nranks + << ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr + << ", dtype:" << dtype << ", root:" << root + << ", comm: " << comm->comm() << ", stream: " << stream; PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); diff --git a/paddle/fluid/operators/compat/while.pbtxt b/paddle/fluid/operators/compat/while.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..34435e1d9e5ff383dd1f7fca82ee10b5428b4acd --- /dev/null +++ b/paddle/fluid/operators/compat/while.pbtxt @@ -0,0 +1,49 @@ +type: "while" +def { + inputs { + name: "X" + } + inputs { + name: "Condition" + } + outputs { + name: "Out" + } + outputs { + name: "StepScopes" + } + attrs { + name: "sub_block" + type: BLOCK + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "skip_eager_deletion_vars" + type: STRINGS + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index bbc42d97146f24e69d2f2337967e129af013fb6c..68a52a79e4ce33311780fdf1993397b717a718b2 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -233,7 +233,8 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CPU_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -242,4 +243,5 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatKernel); diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc index 8c30703f2576b35deb419238de08c5f2fa7b42d2..8732556acb9fdee6e6d83fb34f1bcadf7d8b4bb6 100644 --- a/paddle/fluid/operators/concat_op.cu.cc +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -23,7 +23,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ConcatKernel, ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CUDA_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -31,4 +32,5 @@ REGISTER_OP_CUDA_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatKernel); diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 3afe4f1e3d1027ce37404544dcd0929cc41cb6a3..4d801bc003ea9ac417ff66deda8359f2921e01f6 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -78,9 +78,9 @@ REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, REGISTER_OP_CPU_KERNEL( conj, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, ops::ConjKernel, ops::ConjKernel, diff --git a/paddle/fluid/operators/conj_op.cu b/paddle/fluid/operators/conj_op.cu index 601caeb50558876b972014813ca6dc247aecfeba..d04024d70a8ea66128010d39c9eb1233d28caf03 
100644 --- a/paddle/fluid/operators/conj_op.cu +++ b/paddle/fluid/operators/conj_op.cu @@ -13,15 +13,14 @@ // limitations under the License. #include "paddle/fluid/operators/conj_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( conj, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, ops::ConjKernel, ops::ConjKernel, diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index bf047de86fc21a4d5d9e9ff8f20c9a1982eb25af..a03e4165755dde3211425b028b474896249237f7 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -131,18 +131,18 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP(less_than, "Out = X < Y"); REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, paddle::operators::GreaterThanFunctor, - paddle::operators::LessEqualFunctor); + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, paddle::operators::GreaterEqualFunctor, - paddle::operators::LessThanFunctor); + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index 3ca700e16e6e7bcf4136ca68dd895593a63824ec..a60201f9d07d69897ec81ced54964a50a9d84795 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -15,15 +15,15 @@ limitations under the License. 
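// [Editorial note, hedged] The compare_op.cc pairings above look like a
// correctness fix: the second functor appears to be the one applied after the
// operands have been swapped for broadcasting, so it must satisfy
// swapped(y, x) == original(x, y). For less_than that partner is greater_than
// (x < y  <=>  y > x), not greater_equal, and correspondingly for the other
// relations; the CUDA registrations in the next hunk receive the same fix.
// A tiny self-check of the identity being relied on:
#include <cassert>
int main() {
  for (int x = 0; x < 4; ++x) {
    for (int y = 0; y < 4; ++y) {
      assert((x < y) == (y > x));    // correct partner for less_than
      assert((x <= y) == (y >= x));  // correct partner for less_equal
      // Note: (x < y) == (y >= x) fails whenever x == y, which is what the
      // old pairing would have gotten wrong.
    }
  }
  return 0;
}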
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_KERNEL(greater_than, CUDA, paddle::operators::GreaterThanFunctor, - paddle::operators::LessEqualFunctor); + paddle::operators::LessThanFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, paddle::operators::GreaterEqualFunctor, - paddle::operators::LessThanFunctor); + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, paddle::operators::EqualFunctor); REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index 26f12e8f9e3bfa088dfd7e7532dc1e99a5146a89..31acd9718115c78568326532e922aad543164732 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -33,7 +33,7 @@ class DotOp : public framework::OperatorWithKernel { "Output(Out) of DotOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); - auto x_rank = (size_t)x_dims.size(); + auto x_rank = static_cast(x_dims.size()); PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, platform::errors::PreconditionNotMet( "ShapeError: The dimensions of input tensor X (%s) " @@ -154,15 +154,15 @@ REGISTER_OP_CPU_KERNEL( ops::DotKernel, ops::DotKernel, ops::DotKernel, + paddle::platform::complex>, ops::DotKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( dot_grad, ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, + paddle::platform::complex>, ops::DotGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/dot_op.cu b/paddle/fluid/operators/dot_op.cu index 2d259ba1fbc9b4c495eb696e899ad94bb3b5e5be..49f27e1ffb12888e2361e6a504c85b02d84d6480 100644 --- a/paddle/fluid/operators/dot_op.cu +++ b/paddle/fluid/operators/dot_op.cu @@ -22,12 +22,14 @@ REGISTER_OP_CUDA_KERNEL( ops::DotKernel, ops::DotKernel, ops::DotKernel, - ops::DotKernel, - ops::DotKernel); -REGISTER_OP_CUDA_KERNEL( - dot_grad, ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel); + ops::DotKernel>, + ops::DotKernel>); +REGISTER_OP_CUDA_KERNEL(dot_grad, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel>, + ops::DotGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index b551629169deed66a1a79636287569995726c4be..67e2e3a1e96772c7508724c1cb21cf670bb84e31 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -135,9 +135,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -145,9 +145,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, + paddle::platform::complex>, ops::ElementwiseAddGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseAddDoubleGradKernel); + paddle::platform::complex>); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. @@ -178,9 +178,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_add) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index dc9c18ba038861b763cb52863ddae8ac69db5022..37e5fa5a20657748804442e549baa999169836d2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -39,15 +38,24 @@ struct CudaAddFunctor { }; template -struct SameDimsElemwiseAdd { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { +class ElementwiseAddKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + axis = axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis; + std::vector ins = {x, y}; std::vector outs = {z}; + const auto& cuda_ctx = + ctx.template device_context(); + LaunchElementwiseCudaKernel( - ctx.template device_context(), ins, &outs, - CudaAddFunctor()); + cuda_ctx, ins, &outs, axis, CudaAddFunctor()); } }; @@ -132,8 +140,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -141,8 +149,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel>, + ops::ElementwiseAddGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, @@ -151,9 +161,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + plat::complex>, ops::ElementwiseAddDoubleGradKernel); + plat::complex>); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, @@ -161,5 +171,5 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index abea9da9423553e177581a30c02fe73dc50369c6..ec7d036a1a1e0295ec496960069335fb33d3d003 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -20,11 +20,13 @@ limitations under the License. 
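// [Editorial note] In the CUDA ElementwiseAddKernel above, an axis of -1 is
// resolved to |rank(x) - rank(y)|, i.e. the lower-rank operand is aligned with
// the trailing dimensions of the higher-rank one. A small worked example of
// that rule (plain integers only, no Paddle types):
#include <cstdlib>
int ResolveAxis(int axis, int x_rank, int y_rank) {
  return axis == -1 ? std::abs(x_rank - y_rank) : axis;
}
// e.g. x shape [2, 3, 4, 5] and y shape [4, 5]: ResolveAxis(-1, 4, 2) == 2,
// so y is broadcast against x starting at dimension index 2.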
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef __NVCC__ #include #include #include "cub/cub.cuh" + #endif #ifdef __HIPCC__ #include @@ -38,9 +40,10 @@ namespace paddle { namespace operators { template -void default_elementwise_add(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z) { +void LaunchBroadcastElementwiseCpuKernel(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, + framework::Tensor *z) { int axis = ctx.Attr("axis"); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -68,12 +71,13 @@ class ElementwiseAddKernel : public framework::OpKernel { auto *y = ctx.Input("Y"); auto *z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - auto dims_equal = x->dims() == y->dims(); - if (dims_equal) { - SameDimsElemwiseAdd same_dims_add; - same_dims_add(ctx, x, y, z); + if (x->dims() == y->dims()) { + SameDimsElemwiseAdd + LaunchElementwiseCpuKernel; + LaunchElementwiseCpuKernel(ctx, x, y, z); } else { - default_elementwise_add(ctx, x, y, z); + LaunchBroadcastElementwiseCpuKernel(ctx, x, + y, z); } } }; @@ -459,8 +463,8 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); ddout->mutable_data(ctx.GetPlace()); - default_elementwise_add(ctx, &ddx_safe, &ddy_safe, - ddout); + LaunchBroadcastElementwiseCpuKernel(ctx, &ddx_safe, + &ddy_safe, ddout); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 8d99aa2798568f507fceaf33772e85a81fd23b67..8b902acebb4c5d4a8f739c9fe0e5a6f40c31ee9f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -141,6 +141,7 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { } } + const T* dz_data = dz->data(); T* dx_data = nullptr; T* dy_data = nullptr; if (dx) { @@ -152,9 +153,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { auto& dev_ctx = ctx.template device_context(); - int ret = xpu::broadcast_add_grad(dev_ctx.x_context(), dx_data, dx_data, - dx_data, dz->data(), dy_data, - dx_data, x_dims_vec, y_dims_vec); + int ret = xpu::broadcast_add_grad(dev_ctx.x_context(), dz_data, dz_data, + dz_data, dz_data, dy_data, dx_data, + x_dims_vec, y_dims_vec); PADDLE_ENFORCE_EQ( ret, xpu::SUCCESS, platform::errors::External( diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 0252e6dfff5d755cdc9ded56df4dc77f1c542fc0..9a899ec11b4c17cadd836c5959ca7e4287e2dbd2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -17,8 +17,7 @@ limitations under the License. 
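// [Editorial note, hedged] In the elementwise_add_op_xpu.cc hunk above, the
// leading tensor arguments of xpu::broadcast_add_grad are switched from
// dx_data (which at that point is only an output buffer, and may be nullptr)
// to dz_data, the incoming upstream gradient. Apparently the kernel is meant
// to read the out-gradient dz and scatter it into dx/dy, so passing dx_data
// there fed it the wrong, uninitialized buffer.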
*/ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -135,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -145,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad_grad, @@ -160,9 +159,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 0cf9294c9de67fe4e7f2f32ff96c53586c8e860b..b10ed57af901f03688cff0129107e786628b6393 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -76,18 +75,21 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex64* x, const paddle::platform::complex64* y, - const paddle::platform::complex64* out, - const paddle::platform::complex64* dout, int64_t size, - paddle::platform::complex64* dx, paddle::platform::complex64* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex64 o = dout[col]; - paddle::platform::complex64 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex64 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -95,19 +97,21 @@ __global__ void SimpleElemwiseDivGradCUDAKernel( } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex128* x, - const paddle::platform::complex128* y, - const paddle::platform::complex128* out, - const paddle::platform::complex128* dout, int64_t size, - paddle::platform::complex128* dx, 
paddle::platform::complex128* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex128 o = dout[col]; - paddle::platform::complex128 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex128 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -145,9 +149,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -157,9 +161,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad_grad, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 0be8d934b17af7e367eefa2e4c5319f8cb1974f4..a0b9633acb2e5956754d07c53bcdcea7b2896c07 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -74,23 +74,13 @@ struct DivGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } }; -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct DivGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout / y_conj; } }; @@ -102,23 +92,13 @@ struct DivGradDY { } }; -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 out_div_y_conj((out / y).real, -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, 
paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 out_div_y_conj((out / y).real, +template +struct DivGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex out_div_y_conj((out / y).real, -(out / y).imag); return -dout * out_div_y_conj; } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 6bf296f0e0b57aaab6e16083a35eab5ec80613ef..0045f00ecc6c25ca700cb8bbdca510fc7f705b8e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -134,9 +133,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + paddle::platform::complex>, ops::ElementwiseMulKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -144,9 +143,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + paddle::platform::complex>, ops::ElementwiseMulGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseMulDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index e01b5eb5fb73d9aca7de318276014f29576040a9..8fd4609c3aa8508687540d5424a9e91511a1a3b5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -14,8 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -76,31 +75,31 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex64* x, const plat::complex64* y, - const plat::complex64* out, const plat::complex64* dout, int64_t size, - plat::complex64* dx, plat::complex64* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - plat::complex64 o = dout[col]; - dx[col] = plat::complex64(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex64(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex128* x, const plat::complex128* y, - const plat::complex128* out, const plat::complex128* dout, int64_t size, - plat::complex128* dx, plat::complex128* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - plat::complex128 o = dout[col]; - dx[col] = plat::complex128(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex128(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } @@ -133,8 +132,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel>, + ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -142,8 +141,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel>, + ops::ElementwiseMulGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, @@ -152,6 +153,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + plat::complex>, ops::ElementwiseMulDoubleGradKernel); + plat::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 46a00268e4134a1a797954a6d61cfcf0d88f9b79..10e69491643c92d77f58c487abd122d51def82e5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -132,23 +132,13 @@ struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; 
} }; -template <> -struct MulGradDX { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template <> -struct MulGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct MulGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout * y_conj; } }; @@ -158,23 +148,13 @@ struct MulGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; - -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 x_conj(x.real, -x.imag); +template +struct MulGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex x_conj(x.real, -x.imag); return dout * x_conj; } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..1492fc629457cd5f7ca312b452ccd79ab30f175d --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -0,0 +1,525 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.1 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" + +namespace paddle { +namespace operators { + +struct DimensionsTransform { + using DimVector = std::vector; + typedef void (*MergeFunctor)(bool &, std::vector &, DimVector &, + int, int); + int64_t dim_size; + DimVector out_dims; + std::vector in_dims; + + private: + // To compensate the lackage of input_tensors` dimension with input variable + // 'axis' + void InputDimensionsExtend(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < dim_size) { + DimVector tmp_dim(dim_size, 1); + do { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth dimension of input tensor is expected to be equal " + "with" + "the %dth dimension of output tensor %d or 1, but recieved " + "%d.\n", + in_idx + 1, axis + 1, out_dims[axis], in_dim[in_idx])); + } + } while (in_idx < in_dim.size()); + in_dim.resize(dim_size); + std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + } else { + do { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth dimension of input tensor is expected to be equal " + "with" + "the %dth dimension of output tensor %d or 1, but recieved " + "%d.\n", + in_idx + 1, in_idx + 1, out_dims[in_idx], in_dim[in_idx])); + } + } while (in_idx < dim_size); + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = + std::accumulate(vec->begin() + l_idx, vec->begin() + m_idx, 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; + + int64_t i = 0; + while (i < dim_size) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < dim_size); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + dim_size -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + public: + explicit DimensionsTransform( + const std::vector &ins, + const framework::DDim &dims, int axis) { + const int N = ins.size(); + dim_size = dims.size(); + out_dims = framework::vectorize(dims); + in_dims.resize(N); + for (int j = 0; j < N; ++j) { + in_dims[j] = framework::vectorize(ins[j]->dims()); + } + InputDimensionsExtend(N, axis); + + auto merge_sequential_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, int i, int num) { + for (int j = 1; j < num; ++j) { + equal = (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, int i, int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal = in_dims[j][i] == out[i]; + } + } + }; + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. 
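// [Editorial example, hedged] Roughly what the two merge passes achieve
// (ignoring the internal dimension reversal): with out_dims = [2, 3, 4] and
// inputs broadcast as in0 = [2, 3, 4], in1 = [2, 3, 1], the leading pair
// (2, 3) is equal across all inputs and collapses into a single dimension 6,
// leaving out = [6, 4], in0 = [6, 4], in1 = [6, 1] -- fewer dimensions for the
// divmod-based offset computation used by the broadcast kernel below.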
+ MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + int min_idx = 0; + int min_val = std::accumulate(in_dims[0].begin(), in_dims[0].end(), 1, + std::multiplies()); + for (int j = 1; j < N; ++j) { + int temp = std::accumulate(in_dims[j].begin(), in_dims[j].end(), 1, + std::multiplies()); + min_val = min_val > temp ? temp : min_val; + min_idx = min_val == temp ? j : min_idx; + } + std::swap(in_dims[0], in_dims[min_idx]); + + // To Merge the dimension of input_tensors while the consequtive + // 1-value-dimensions appears. + merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[min_idx], in_dims[0]); + } +}; + +struct StridesCalculation { + std::vector> strides; + std::vector divmoders; + + private: + // To calculate the strides of each input_tensor. + __inline__ void CalculateStrides( + int N, int dim_size, const std::vector> &in_dims) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < dim_size; ++i) { + strides[j][i] = in_dims[j][i] == 1 ? 0 : strides[j][i]; + strides[j][i] = + (i != 0 && strides[j][i] != 0) + ? std::accumulate(in_dims[j].begin(), in_dims[j].begin() + i, 1, + std::multiplies()) + : strides[j][i]; + } + } + } + + public: + explicit StridesCalculation(const int64_t &dim_size, + const std::vector> &in_dims, + const std::vector &out_dims) { + const auto N = in_dims.size(); + divmoders.resize(dim_size); + strides.resize(N, std::vector(dim_size, 1)); + + for (int i = 0; i < dim_size; ++i) { + divmoders[i] = FastDivMod(out_dims[i]); + } + CalculateStrides(N, dim_size, in_dims); + } +}; + +template +struct BroadcastArgsWarpper { + using InVecType = CudaAlignedVector; + using OutVecType = CudaAlignedVector; + + OutT *out_data; + OutVecType *vec_out_data; + const InT *__restrict__ in_data[ET]; + const InVecType *__restrict__ vec_in_data[ET]; + bool no_broadcast[ET]; + FastDivMod divmoders[kDims]; + uint32_t strides[ET][framework::DDim::kMaxRank]; + uint32_t scalar_cal_offset; + Functor func; + + HOSTDEVICE BroadcastArgsWarpper( + const std::vector &ins, framework::Tensor *out, + int scalar_cal_offset, Functor func, + const StridesCalculation &offset_calculator) + : scalar_cal_offset(scalar_cal_offset), func(func) { + for (int j = 0; j < ET; ++j) { + in_data[j] = ins[j]->data(); + vec_in_data[j] = reinterpret_cast(in_data[j]); + no_broadcast[j] = ins[j]->dims() == out->dims() ? 
true : false; + memcpy(strides[j], offset_calculator.strides[j].data(), + kDims * sizeof(uint32_t)); + } + out_data = out->data(); + vec_out_data = reinterpret_cast(out_data); + memcpy(divmoders, offset_calculator.divmoders.data(), + kDims * sizeof(FastDivMod)); + } + + __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) { + uint32_t offset = 0; + +#pragma unroll(kDims) + for (int i = 0; i < kDims; ++i) { + auto fast_divmoder = divmoders[i].Divmod(idx); + idx = fast_divmoder.val[0]; + offset += fast_divmoder.val[1] * strides[in_idx][i]; + } + return offset; + } + + __device__ __forceinline__ void LoadVectorizedDataCommon( + InVecType *vector_args, int tid, int idx) { + *vector_args = vec_in_data[idx][tid]; + } + + __device__ __forceinline__ void LoadVectorizedDataByDivmod(InT *scalar_args, + int tid, int idx) { + int index = tid * VecSize; +#pragma unroll(VecSize) + for (int i = 0; i < VecSize; ++i) { + uint32_t offset = GetOffsetByDivmod(index + i, idx); + scalar_args[i] = in_data[idx][offset]; + } + } + + __device__ __forceinline__ void LoadScalarizedDataCommon(InT args[], int tid, + int idx) { + args[idx] = in_data[idx][tid + scalar_cal_offset]; + } + + __device__ __forceinline__ void LoadScalarizedDataByDivmod(InT args[], + int tid, int idx) { + auto offset = GetOffsetByDivmod(tid + scalar_cal_offset, idx); + args[idx] = in_data[idx][offset]; + } + + __device__ __forceinline__ void LoadVectorizedData(InT (*args)[VecSize], + int tid) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + if (no_broadcast[j]) { + InVecType *vector_args = reinterpret_cast(args[j]); + LoadVectorizedDataCommon(vector_args, tid, j); + } else { + LoadVectorizedDataByDivmod(args[j], tid, j); + } + } + } + + __device__ __forceinline__ void LoadScalarizedData(InT args[], int tid) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + if (no_broadcast[j]) { + LoadScalarizedDataCommon(args, tid, j); + } else { + LoadScalarizedDataByDivmod(args, tid, j); + } + } + } + + __device__ __forceinline__ void StoreVectorizedData(OutVecType vec_args_out, + int tid) { + vec_out_data[tid] = vec_args_out; + } + + __device__ __forceinline__ void StoreScalarizedData(OutT args_out, int tid) { + out_data[scalar_cal_offset + tid] = args_out; + } +}; + +template +__device__ inline void ScalarizedBroadcastKernelImpl( + BroadcastArgsWarpper broadcast_warpper, int tid) { + InT args[ET]; + OutT args_out; + broadcast_warpper.LoadScalarizedData(args, tid); + +#pragma unroll(ET) + for (int j = 1; j < ET; ++j) { + args_out = broadcast_warpper.func(args); + } + broadcast_warpper.StoreScalarizedData(args_out, tid); +} + +template +__device__ inline void VectorizedBroadcastKernelImpl( + BroadcastArgsWarpper broadcast_warpper, int tid) { + using OutVecType = CudaAlignedVector; + OutVecType args_out; + InT ins[ET]; + InT args[ET][VecSize]; + broadcast_warpper.LoadVectorizedData(args, tid); + +#pragma unroll(VecSize) + for (int i = 0; i < VecSize; ++i) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + ins[j] = args[j][i]; + } + args_out.val[i] = broadcast_warpper.func(ins); + } + broadcast_warpper.StoreVectorizedData(args_out, tid); +} + +template +__global__ void ElementwiseBroadcastKernel( + BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // Vectorized calculation of major data whose length is the max multipler of + // VecSize, + // eg: Calcualting the front 1024-length data in total 1027 data once VecSize + // is 4. 
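// [Editorial example] Concretely, for the case named in the comment above
// (1027 elements, VecSize = 4), the launch computes
//   main_tid = 1027 / 4 = 256  -> threads 0..255 each load/store one 4-wide
//                                 vector, covering the leading 1024 elements;
//   tail_tid = 1027 % 4 = 3    -> threads 0..2 each handle one of the last
//                                 3 elements as a scalar, offset by vec_len = 1024.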
+ if (tid < main_tid) { + VectorizedBroadcastKernelImpl( + broadcast_warpper, tid); + } + // Scalarzed calculation of rest data whose lenght cannot fulfill VecSize. + // eg: Calcualting the rest 3-length data in total 1027 data once VecSize is + // 4. + if (tid < tail_tid) { + ScalarizedBroadcastKernelImpl( + broadcast_warpper, tid); + } +} + +template +void LaunchBroadcastKernelForDifferentDimSize( + const platform::CUDADeviceContext &ctx, + const std::vector &ins, framework::Tensor *out, + int axis, Functor func) { + int numel = out->numel(); + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; + int main_tid = numel / VecSize; + int tail_tid = numel % VecSize; + int vec_len = main_tid * VecSize; + auto stream = ctx.stream(); + + const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); + const auto offset_calculator = StridesCalculation( + merge_dims.dim_size, merge_dims.in_dims, merge_dims.out_dims); + + switch (merge_dims.dim_size) { + case 1: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 2: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 3: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 4: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 5: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 6: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 7: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 8: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but recieved %d.\n", + merge_dims.dim_size, framework::DDim::kMaxRank)); + } + } +} + +template +void LaunchBroadcastElementwiseCudaKernel( + const platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, int axis, Functor func) { + static_assert(ET == (ElementwiseType)2, "Only Support binary calculation."); + int in_vec_size = 4; + framework::Tensor *out = (*outs)[0]; + for (auto *in : ins) { + auto temp_size = GetVectorizedSizeImpl(in->data()); + in_vec_size = in->dims() == out->dims() ? 
std::min(temp_size, in_vec_size) + : in_vec_size; + } + int out_vec_size = GetVectorizedSizeImpl(out->data()); + int vec_size = std::min(out_vec_size, in_vec_size); + + switch (vec_size) { + case 4: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + case 2: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + case 1: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + default: { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +template +void LaunchElementwiseCudaKernel( + const platform::CUDADeviceContext &cuda_ctx, + const std::vector &ins, + std::vector *outs, int axis, Functor func) { + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag = ins[0]->dims() == in->dims(); + } + + if (no_broadcast_flag) { + LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, outs, func); + } else { + LaunchBroadcastElementwiseCudaKernel(cuda_ctx, ins, outs, axis, + func); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 38b1afbdc3342e8bc4d9901b64bae808fd9d3915..33a2b7e182f0a06232015abf34b810bcc1ce39af 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/fast_divmod.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 @@ -29,11 +28,6 @@ namespace operators { enum ElementwiseType { kUnary = 1, kBinary = 2 }; -template -struct alignas(sizeof(T) * Size) CudaAlignedVector { - T val[Size]; -}; - template int GetVectorizedSizeImpl(const T *pointer) { uint64_t address = reinterpret_cast(pointer); @@ -181,7 +175,7 @@ __global__ void ScalarKernel(const InT *__restrict__ in0, } template -void LaunchElementwiseCudaKernel( +void LaunchSameDimsElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, Functor func) { @@ -197,6 +191,7 @@ void LaunchElementwiseCudaKernel( OutT *out = (*outs)[0]->data(); // cuda kernel auto stream = ctx.stream(); + switch (vec_size) { case 4: VectorizedKernel<<>>( diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 1951ed7f5da67316a11d0bbc96b902dbf9a4c440..84aa189b89e909f66c994bd765a3d192e393a1ea 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -134,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -144,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 192999fd2ac831e85d42a41e5a54754a49f4ddce..19cbbb7bf04287b49e023aaa10c9635b6c4fbda7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -103,9 +102,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -115,9 +114,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 97cd4d90be689ac7e891af9fe098b56bea000166..e9ad2895e03db8e77470c490453427a41d8e3bba 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -173,7 +173,9 @@ void FusedBatchNormActOpMaker::Make() { .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, platform::errors::InvalidArgument( - "'epsilon' should be between 0.0 and 0.001.")); + "Attr(epsilon) should be between 0.0 and 0.001, " + "but received value is %f.", + epsilon)); }); AddAttr("act_type", "The activation type to be fused.") .SetDefault("relu"); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc index b53b407d4995da5d548a13fec20ff3b09a5583c4..4d270280d389c6d8c34e3a5691a41a684b537577 100644 
--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc @@ -25,11 +25,13 @@ class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* context) const override { - PADDLE_ENFORCE_EQ(context->Inputs("Ids").size(), - context->Inputs("Embs").size(), - platform::errors::InvalidArgument( - "Two inputs of EmbeddingEltWiseLayerNormOp shoube be " - "the same size")); + PADDLE_ENFORCE_EQ( + context->Inputs("Ids").size(), context->Inputs("Embs").size(), + platform::errors::InvalidArgument( + "Two inputs of EmbeddingEltWiseLayerNormOp shoube be " + "the same size, but received the size of input Ids = %d," + " the size of input Embs = %d", + context->Inputs("Ids").size(), context->Inputs("Embs").size())); PADDLE_ENFORCE_GE(context->Inputs("Embs").size(), 2UL, platform::errors::InvalidArgument( "Input Embs of EmbeddingEltWiseLayerNormOp should " @@ -77,7 +79,8 @@ class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( embs_dims[i][1], hidden, platform::errors::InvalidArgument( - "The Emb first dim size(%d) shoule equal to hidden (%d).", + "The second dimension size(%d) of the Embedding should be " + "equal to the hidden's size(%d)", embs_dims[i][1], hidden)); } diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc index bd376b1e7aaefbf890e174cc86899b990a9fed26..382d01f6a535c76bdd38102a0cb40e5afc345f07 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc @@ -40,7 +40,9 @@ class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel { const size_t n = ins.size(); PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument( - "Input tensors dim size should greater than 0.")); + "The size of Inputs(X)'s dimension should be greater " + " than 0, but received %d.", + n)); std::vector trans_axis = ctx->Attrs().Get>("trans_axis"); diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index ae3d0f2633bb18d469b5f755fb81bafab5bab10d..6d1dac830405079feb9333c86b755682dcdba13c 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -40,16 +40,6 @@ class GatherOpXPUKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - // check index type is INT32 - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "XPU only support INT32, it holds %s, but desires to be %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32))); const auto index_dims = index->dims(); if (index_dims.size() == 2) { @@ -65,14 +55,26 @@ class GatherOpXPUKernel : public framework::OpKernel { "The index should be 1D, when it is not 2D, but we get %d", index_dims.size())); } - int slice_size = x->numel() / x->dims()[0]; + std::vector xshape(x->dims().size()); + for (int i = 0; i < x->dims().size(); ++i) { + xshape[i] = x->dims()[i]; + } + auto &dev_ctx = ctx.template device_context(); - int r = - xpu::gather(dev_ctx.x_context(), x->data(), index->data(), - index->dims()[0], slice_size, 
output->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! error code=%d", r)); + int r = XPU_SUCCESS; + if (index->type() == framework::proto::VarType::INT32) { + r = xpu::gather(dev_ctx.x_context(), x->data(), + index->data(), output->data(), xshape, + index->dims()[0], 0); + } else { + r = xpu::gather(dev_ctx.x_context(), x->data(), + index->data(), output->data(), + xshape, index->dims()[0], 0); + } + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU gather kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; @@ -93,30 +95,11 @@ class GatherGradOpXPUKernel : public framework::OpKernel { PADDLE_THROW(platform::errors::InvalidArgument( "Now, it doesn't support XPU with Axis.")); } - - dx->mutable_data(ctx.GetPlace()); - const int zero = 0; - int r_dx = xpu::memset(dev_ctx.x_context(), dx->data(), zero, - dx->numel() * sizeof(T)); - PADDLE_ENFORCE_EQ( - r_dx, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! error code=%d", r_dx)); - if (dout->numel() == 0) { return; } - bool overwrite = ctx.Attr("overwrite"); - // check index type is INT32 - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "XPU only support INT32, it holds %s, but desires to be %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32))); + bool overwrite = ctx.Attr("overwrite"); const auto index_dims = index->dims(); if (index_dims.size() == 2) { PADDLE_ENFORCE_EQ( @@ -131,16 +114,27 @@ class GatherGradOpXPUKernel : public framework::OpKernel { "The index should be 1D, when it is not 2D, but we get %d", index_dims.size())); } + std::vector xshape(dx->dims().size()); + for (int i = 0; i < dx->dims().size(); ++i) { + xshape[i] = dx->dims()[i]; + } - int index_size = index_dims[0]; - int slice_size = dout->numel() / dout->dims()[0]; + dx->mutable_data(ctx.GetPlace()); - int r = xpu::scatter(dev_ctx.x_context(), dout->data(), - index->data(), index_size, slice_size, - dx->data(), overwrite); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! 
error code=%d", r)); + int r = XPU_SUCCESS; + if (index->type() == framework::proto::VarType::INT32) { + r = xpu::gather_grad(dev_ctx.x_context(), dout->data(), + index->data(), dx->data(), xshape, + index->dims()[0], 0, overwrite); + } else { + r = xpu::gather_grad(dev_ctx.x_context(), dout->data(), + index->data(), dx->data(), + xshape, index->dims()[0], 0, overwrite); + } + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU gather grad kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 899025ae7093b45833805687c9d499e2d1fa02e7..6a195bb9400e89ef09bc7ca2c08637eeb505dda2 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -96,11 +96,11 @@ REGISTER_OPERATOR(imag, ops::ImagOp, ops::ImagOpMaker, REGISTER_OPERATOR(imag_grad, ops::ImagGradOp); REGISTER_OP_CPU_KERNEL(imag, ops::ImagKernel, + paddle::platform::complex>, ops::ImagKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL(imag_grad, ops::ImagGradKernel, + paddle::platform::complex>, ops::ImagGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/imag_op.cu b/paddle/fluid/operators/imag_op.cu index a7a3b1368219891dc5d98e25f4c38be5ad216baf..9cfb2ef7f2fef6b25322ba76bedadae3c6ca8d87 100644 --- a/paddle/fluid/operators/imag_op.cu +++ b/paddle/fluid/operators/imag_op.cu @@ -18,11 +18,11 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(imag, ops::ImagKernel, + paddle::platform::complex>, ops::ImagKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL(imag_grad, ops::ImagGradKernel, + paddle::platform::complex>, ops::ImagGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 9574b325ef77fd22c2baeea1bc45469b14c597a1..87618b954d232dcfe5d0ed0b8062db7c324c1290 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -29,6 +29,11 @@ class LookupTableV2NPUKernel : public framework::OpKernel { auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + // It seems cann 20.1 accepts int64, but cann 20.2+ not. + PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32, + platform::errors::Unimplemented( + "The index of LookupTableV2 should be int32.")); + auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( table_var->IsType(), true, diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/fluid/operators/math/complex_functors.h index 0e8aed40f6e16a6bd5395bdeadd49b80a132ae6f..f530256677854860fd7d3c6163a142ad8ba2da42 100644 --- a/paddle/fluid/operators/math/complex_functors.h +++ b/paddle/fluid/operators/math/complex_functors.h @@ -16,8 +16,7 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { @@ -66,7 +65,10 @@ using select_t = typename select::type; template using Real = select_t::value, float>, - cond::value, double>, T>; + cond::value, double>, + cond>::value, float>, + cond>::value, double>, + T>; template using Complex = typename std::enable_if::value>::type; @@ -76,14 +78,18 @@ template using NoComplex = typename std::enable_if::value>::type; template -using EnableComplex = - typename std::enable_if::value || - std::is_same::value>::type; +using EnableComplex = typename std::enable_if< + std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value>::type; template using DisableComplex = typename std::enable_if< !std::is_same::value && - !std::is_same::value>::type; + !std::is_same::value && + !std::is_same>::value && + !std::is_same>::value>::type; template struct RealFunctor; @@ -173,44 +179,45 @@ struct AbsGradFunctor { }; template <> -struct AbsGradFunctor { - AbsGradFunctor(const float* dout, const paddle::platform::complex64* x, - paddle::platform::complex64* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const float* dout, const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex64(0)) { - output_[idx] = paddle::platform::complex64(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex64(dout_[idx]) * - (x_[idx] / paddle::platform::complex64(abs(x_[idx]))); + output_[idx] = paddle::platform::complex(dout_[idx]) * + (x_[idx] / paddle::platform::complex(abs(x_[idx]))); } } const float* dout_; - const paddle::platform::complex64* x_; - paddle::platform::complex64* output_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template <> -struct AbsGradFunctor { - AbsGradFunctor(const double* dout, const paddle::platform::complex128* x, - paddle::platform::complex128* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const double* dout, const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex128(0)) { - output_[idx] = paddle::platform::complex128(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex128(dout_[idx]) * - (x_[idx] / paddle::platform::complex128(abs(x_[idx]))); + output_[idx] = + paddle::platform::complex(dout_[idx]) * + (x_[idx] / paddle::platform::complex(abs(x_[idx]))); } } const double* dout_; - const paddle::platform::complex128* x_; - paddle::platform::complex128* output_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; @@ -234,46 +241,46 @@ struct AbsGradGradFunctor { }; template <> -struct AbsGradGradFunctor { - AbsGradGradFunctor(const paddle::platform::complex128* ddx, - const paddle::platform::complex128* x, - paddle::platform::complex128* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const 
paddle::platform::complex* ddx, + const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex128(0)) { - output_[idx] = paddle::platform::complex128(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex128(ddx_[idx]) * x_[idx] / - paddle::platform::complex128(abs(x_[idx])); + output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / + paddle::platform::complex(abs(x_[idx])); } } - const paddle::platform::complex128* ddx_; - const paddle::platform::complex128* x_; - paddle::platform::complex128* output_; + const paddle::platform::complex* ddx_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template <> -struct AbsGradGradFunctor { - AbsGradGradFunctor(const paddle::platform::complex64* ddx, - const paddle::platform::complex64* x, - paddle::platform::complex64* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const paddle::platform::complex* ddx, + const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex64(0)) { - output_[idx] = paddle::platform::complex64(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex64(ddx_[idx]) * x_[idx] / - paddle::platform::complex64(abs(x_[idx])); + output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / + paddle::platform::complex(abs(x_[idx])); } } - const paddle::platform::complex64* ddx_; - const paddle::platform::complex64* x_; - paddle::platform::complex64* output_; + const paddle::platform::complex* ddx_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index d6ad3aec22b1fed22e317b9935be56172fe0ec8d..a79a9da0b30f2be8f591e584bc56740e38b34594 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,16 +65,18 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16); \ - macro(::paddle::platform::bfloat16); \ - macro(::paddle::platform::complex64); \ +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + macro(::paddle::platform::bfloat16); \ + macro(::paddle::platform::complex); \ + macro(::paddle::platform::complex); \ + macro(::paddle::platform::complex64); \ macro(::paddle::platform::complex128) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 56217b4dc7ef5a2adc96bfa9c27aeba33af57893..d01a39ecb7c931e855c0ed954a0bc5e23732d168 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -47,6 +47,10 @@ 
template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #ifdef PADDLE_WITH_XPU template struct SetConstant; @@ -59,6 +63,10 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #endif #define DEFINE_CPU_TRANS(RANK) \ @@ -74,6 +82,10 @@ template struct SetConstant; template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; \ template struct Transpose; \ template struct Transpose); +DEFINE_CPU_TRANS_NORMAL(platform::complex); struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index f94c1bf696cdad5727fcf9ae659c1430b0f8bef4..c5c78c87f7977234fe25b2c1774157e2f55a8c84 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -43,6 +43,10 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #define DEFINE_GPU_TRANS(RANK) \ template struct Transpose; \ @@ -52,6 +56,10 @@ template struct SetConstant; template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; \ template struct Transpose; \ template struct Transpose; @@ -145,6 +153,8 @@ DEFINE_GPU_TRANS_NORMAL(uint8_t); DEFINE_GPU_TRANS_NORMAL(int8_t); DEFINE_GPU_TRANS_NORMAL(complex64); DEFINE_GPU_TRANS_NORMAL(complex128); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); struct TensorSetConstantGPU { TensorSetConstantGPU(const platform::DeviceContext& context, diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index c12aecc9ba5160b532c5bb35e2564209946b7f42..e226ab532884446490d292c656c53c853327576d 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -825,6 +825,21 @@ class MatMulOpGrad : public framework::OperatorWithKernel { context->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; template diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 4ffcbaf55314a46888e15572e8477054b23ae2bb..c18b8590db18da7d21864ee7b4f32a831a6daa5f 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -23,91 +23,112 @@ template class AccuracyNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* pred = ctx.Input("Out"); + auto* inference = ctx.Input("Out"); auto* label = 
ctx.Input("Label"); - // auto* logits = ctx.Input("Indices"); + auto* indices = ctx.Input("Indices"); - auto* acc = ctx.Output("Accuracy"); + auto* accuracy = ctx.Output("Accuracy"); auto* correct = ctx.Output("Correct"); auto* total = ctx.Output("Total"); auto stream = ctx.template device_context() .stream(); - // cast pred - Tensor tmp_pred(pred->type()); - tmp_pred.Resize(pred->dims()); - tmp_pred.mutable_data(ctx.GetPlace()); - auto runner_cast_pred = - NpuOpRunner("Cast", {*pred}, {tmp_pred}, - {{"dst_type", static_cast(ACL_INT32)}}); - runner_cast_pred.Run(stream); - - // cast label - Tensor tmp_label(label->type()); - tmp_label.Resize(label->dims()); - tmp_label.mutable_data(ctx.GetPlace()); - auto runner_cast_label = - NpuOpRunner("Cast", {*label}, {tmp_label}, - {{"dst_type", static_cast(ACL_INT32)}}); - runner_cast_label.Run(stream); + int num_samples = inference->dims()[0]; + if (num_samples == 0) { + return; + } + + // cast `indices` or `label` if their type is not consistent + Tensor cast_indices(framework::proto::VarType::INT32); + Tensor cast_label(framework::proto::VarType::INT32); + if (indices->type() != label->type()) { + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); + if (indices->type() != framework::proto::VarType::INT32) { + cast_indices.Resize(indices->dims()); + cast_indices.mutable_data(ctx.GetPlace()); + auto runner_cast_indices = + NpuOpRunner("Cast", {*indices}, {cast_indices}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_indices.Run(stream); + } else { + cast_indices.ShareDataWith(*indices); + } + if (label->type() != framework::proto::VarType::INT32) { + cast_label.Resize(label->dims()); + cast_label.mutable_data(ctx.GetPlace()); + auto runner_cast_label = + NpuOpRunner("Cast", {*label}, {cast_label}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_label.Run(stream); + } else { + cast_label.ShareDataWith(*label); + } + } else { + cast_indices.ShareDataWith(*indices); + cast_label.ShareDataWith(*label); + } // equal - Tensor tmp_equal(label->type()); - tmp_equal.Resize(label->dims()); + Tensor tmp_equal(framework::proto::VarType::BOOL); + tmp_equal.Resize(inference->dims()); tmp_equal.mutable_data(ctx.GetPlace()); auto runner_equal = - NpuOpRunner("Equal", {tmp_pred, tmp_label}, {tmp_equal}, {}); + NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); runner_equal.Run(stream); // cast equal - Tensor tmp_equal_cast(label->type()); - tmp_equal_cast.Resize(label->dims()); + Tensor tmp_equal_cast(framework::proto::VarType::FP32); + tmp_equal_cast.Resize(inference->dims()); tmp_equal_cast.mutable_data(ctx.GetPlace()); - auto runner_cast_equal = - NpuOpRunner("Cast", {tmp_equal}, {tmp_equal_cast}, - {{"dst_type", static_cast(ACL_FLOAT)}}); + auto runner_cast_equal = NpuOpRunner( + "Cast", {tmp_equal}, {tmp_equal_cast}, + {{"dst_type", + static_cast(ConvertToNpuDtype(tmp_equal_cast.type()))}}); runner_cast_equal.Run(stream); - // acc - acc->mutable_data(ctx.GetPlace()); - std::vector axes_vec_1; - auto runner_acc = NpuOpRunner("ReduceMeanD", {tmp_equal_cast}, {*acc}, - {{"keep_dims", false}, {"axes", axes_vec_1}}); - runner_acc.Run(stream); - - // correct - correct->mutable_data(ctx.GetPlace()); - std::vector axes_vec_2; - auto runner_correct = - NpuOpRunner("ReduceSumD", {tmp_equal_cast}, {*correct}, - {{"keep_dims", false}, {"axes", axes_vec_2}}); - runner_correct.Run(stream); - - // ones_tensor - Tensor ones_tensor(label->type()); - ones_tensor.Resize(label->dims()); - 
ones_tensor.mutable_data(ctx.GetPlace()); - auto runner_oneslike = - NpuOpRunner("OnesLike", {tmp_label}, {ones_tensor}, {}); - runner_oneslike.Run(stream); - - // ones_tensor_cast - Tensor ones_tensor_cast(label->type()); - ones_tensor_cast.Resize(label->dims()); - ones_tensor_cast.mutable_data(ctx.GetPlace()); - auto runner_ones_cast = - NpuOpRunner("Cast", {ones_tensor}, {ones_tensor_cast}, - {{"dst_type", static_cast(ACL_FLOAT)}}); - runner_ones_cast.Run(stream); - - // total - total->mutable_data(ctx.GetPlace()); - std::vector axes_vec_3; - auto runner_total = - NpuOpRunner("ReduceSumD", {ones_tensor_cast}, {*total}, - {{"keep_dims", false}, {"axes", axes_vec_3}}); - runner_total.Run(stream); + // [correct] + // reduce_max + Tensor tmp_correct_max(framework::proto::VarType::FP32); + tmp_correct_max.Resize(framework::make_ddim({num_samples})); + tmp_correct_max.mutable_data(ctx.GetPlace()); + auto runner_reduce_max = + NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max}, + {{"axes", std::vector{1}}, {"keep_dims", false}}); + runner_reduce_max.Run(stream); + + // reduce_sum + Tensor tmp_correct(framework::proto::VarType::FP32); + tmp_correct.Resize(correct->dims()); + tmp_correct.mutable_data(ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct}, + {{"axes", std::vector{0}}, {"keep_dims", false}}); + runner_reduce_sum.Run(stream); + + // cast to int + correct->mutable_data(ctx.GetPlace()); + auto runner_cast_correct = NpuOpRunner( + "Cast", {tmp_correct}, {*correct}, + {{"dst_type", static_cast(ConvertToNpuDtype(correct->type()))}}); + runner_cast_correct.Run(stream); + + // [total] + total->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(total, static_cast(num_samples)); + + // use `total` of type `float32` for calculating accuracy + Tensor tmp_total(framework::proto::VarType::FP32); + tmp_total.Resize(total->dims()); + tmp_total.mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(&tmp_total, + static_cast(num_samples)); + + // [accuracy] + accuracy->mutable_data(ctx.GetPlace()); + auto runner_accuracy = + NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); + runner_accuracy.Run(stream); } }; diff --git a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cfeace6bef99f98fcaa79dae5ba2ff1885092aa --- /dev/null +++ b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class CastMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto x_paddle_type = framework::proto::VarType::Type(in_dtype); + auto out_paddle_type = framework::proto::VarType::Type(out_dtype); + + mkldnn::memory::data_type x_type = + framework::ToMKLDNNDataType(x_paddle_type); + mkldnn::memory::data_type out_type = + framework::ToMKLDNNDataType(out_paddle_type); + + auto x_tz = framework::vectorize(x->dims()); + + std::string key = + platform::CreateKey(dev_ctx, x_tz, x->format(), x->format(), x_type); + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_type, x_type, out_paddle_type, out_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->format(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->format(), dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(cast, MKLDNN, paddle::platform::CPUPlace, + ops::CastMKLDNNKernel, + ops::CastMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index e2e9d280027b6a30958b308429cbb21d61fb2c08..b6b0b486bf060b9f163759540dfa3b17bbe68cb2 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -14,21 +14,104 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +template +class LRNMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* input, + const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), + unique_name)) { + if (!this->isCachedNonBlocking()) { + const int n = ctx.Attr("n"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. 
+ // Hence we need to compensate for this diffrence by + // multipliing alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + bool is_test = ctx.Attr("is_test"); + + auto dims = framework::vectorize(input->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + } + } + + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* in_x, + const Tensor* out_grad, Tensor* in_x_grad, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::PreconditionNotMet( + "is_test attribute should be set to False in training phase.")); + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto dims = framework::vectorize(in_x->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + in_x->format()); + auto diff_md = mkldnn::memory::desc( + dims, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + + this->AcquireBackwardPrimitiveDescriptorNonBlocking( + mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, + beta, k); + } + } + + std::shared_ptr AcquireWorkspaceMemory(Tensor* workspace) { + T* ptr = workspace->mutable_data( + this->place_, this->fwd_pd_->workspace_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), + ptr, "@wrk_mem_p"); + } + + std::shared_ptr AcquireBackwardWorkspaceMemory( + const Tensor* workspace) { + const T* workspace_data = workspace->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->workspace_desc(), + platform::to_void_cast(workspace_data), "@bwd-wrk_mem_p"); + } +}; + template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -48,8 +131,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - platform::LRNMKLDNNHandler handler( - ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, ctx.OutputName("Out")); + LRNMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, + ctx.OutputName("Out")); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); @@ -87,34 +170,22 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL LRNGrad must use CPUPlace")); - PADDLE_ENFORCE_EQ( - ctx.Attr("is_test"), false, - platform::errors::PreconditionNotMet( - "is_test attribute should be set to False in training phase.")); - auto x = ctx.Input("X"); + auto in_x = ctx.Input("X"); auto mid = ctx.Input("MidOut"); auto out_grad = ctx.Input(framework::GradVarName("Out")); 
- auto x_grad = ctx.Output(framework::GradVarName("X")); - - const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); + auto in_x_grad = ctx.Output(framework::GradVarName("X")); auto& dev_ctx = ctx.template device_context(); - auto dims = paddle::framework::vectorize(x->dims()); + LRNMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), in_x, out_grad, + in_x_grad, ctx.InputName("Out")); - platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), - out_grad->format(), dev_ctx, - ctx.GetPlace(), ctx.InputName("Out")); - - auto src_memory = handler.AcquireSrcMemory(x); + auto src_memory = handler.AcquireSrcMemory(in_x); auto workspace = handler.AcquireBackwardWorkspaceMemory(mid); auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad); - auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad); + auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad); auto lrn_bwd = handler.AcquireBackwardPrimitive(); @@ -125,8 +196,8 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_WORKSPACE, *workspace}}); astream.wait(); - x_grad->set_layout(framework::DataLayout::kMKLDNN); - x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); + in_x_grad->set_layout(framework::DataLayout::kMKLDNN); + in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 3ef9d88e4e91e17eb9fbaeac1bcbed53a1bac09e..2b3496359b0c66cccc40ed676cfc462d8148a11c 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace platform { @@ -37,6 +37,111 @@ using platform::MKLDNNGetDataType; using platform::to_void_cast; using Tensor = framework::Tensor; +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +static framework::Tensor FoldOuterDims(const Tensor& input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. 
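+// For example, a 2 x 3 x 4 input is transposed to 3 x 2 x 4 and then viewed
+// as a 3 x 8 matrix.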
+template +static framework::Tensor FoldFirstAndLastDims( + const MKLDNNDeviceContext& dev_ctx, const Tensor* input) { + auto input_dims = framework::vectorize(input->dims()); + if (input_dims.size() != 3) { + return *input; + } + + framework::Tensor output; + output.Resize({input_dims[1], input_dims[0], input_dims[2]}); + + auto output_dims = framework::vectorize(output.dims()); + + memory::data_type input_type = framework::ToMKLDNNDataType(input->type()); + std::string key = platform::CreateKey(dev_ctx, input_dims, input->format(), + input->format(), input_type); + platform::ReorderMKLDNNHandler reorder_handler(output_dims, input->type(), + input_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + memory::format_tag::abc, platform::to_void_cast(input->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + &output, memory::format_tag::bac, dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, + reorder_dst_memory_p); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); + return output; +} + +template +class MatMulMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + MatMulMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, platform::Place cpu_place, + Tensor* x, bool trans_x, Tensor* y, bool trans_y, + Tensor* out, float scale, const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + auto mat_dim_x = math::CreateMatrixDescriptor(x->dims(), 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y->dims(), 0, trans_y); + + memory::dim x_bs = mat_dim_x.batch_size_; + memory::dim y_bs = mat_dim_y.batch_size_; + + memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; + const memory::dim M = mat_dim_x.height_; + const memory::dim N = mat_dim_y.width_; + const memory::dim K = mat_dim_x.width_; + + memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; + memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; + memory::dims out_dims = {out_bs, M, N}; + + memory::dims x_strides = + !trans_x ? memory::dims{M * K, K, 1} : memory::dims{M * K, 1, M}; + + memory::dims y_strides = + !trans_y ? 
memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; + memory::dims out_strides = memory::dims{M * N, N, 1}; + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); + + dnnl::primitive_attr attrs; + if (scale != 1.0f) attrs.set_output_scales(0, {scale}); + + this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); + } + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data), + "@weights_mem_p"); + } +}; + template constexpr bool IsInt8() { return std::is_same::value || std::is_same::value; @@ -44,7 +149,7 @@ constexpr bool IsInt8() { template constexpr bool IsBfloat16() { - return std::is_same::value; + return std::is_same::value; } // Get row matrix shape from a vector shape. If the rank of x_dim > 1, the @@ -60,6 +165,60 @@ static framework::DDim ColumnMatrixDimsFromVector( return y_dim.size() > 1 ? y_dim : framework::make_ddim({y_dim[0], 1}); } +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. + */ +static void ReshapeTensorToMatrixSequence( + framework::Tensor* x, const math::MatDescriptor& descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. 
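+ * For example, X = [2, 3, 4] and Y = [4, 5] (no transpose) give Out = [2, 3, 5].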
+ */ +static void ReshapeXYOutToMatrixSequence(framework::Tensor* x, + framework::Tensor* y, + framework::Tensor* out, bool trans_x, + bool trans_y) { + auto x_dim = RowMatrixDimsFromVector(x->dims()); + auto y_dim = ColumnMatrixDimsFromVector(y->dims()); + auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, mat_dim_y.width_}); + } + + ReshapeTensorToMatrixSequence(x, mat_dim_x); + ReshapeTensorToMatrixSequence(y, mat_dim_y); +} + template class MatMulFactory { public: @@ -372,7 +531,7 @@ static void ExecuteMatMul(const ExecutionContext& ctx) { template class DNNLMatMulKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const ExecutionContext& ctx) const override { if (ctx.HasAttr("head_number")) { PADDLE_ENFORCE_EQ( ctx.Attr("head_number"), 1, @@ -385,6 +544,137 @@ class DNNLMatMulKernel : public framework::OpKernel { ExecuteMatMul(ctx); } }; + +template +class MatMulGradMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), 1, + platform::errors::Unimplemented( + "DNNL matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + RunKernel(ctx); + } + + private: + void ExecuteMatMulGrad(const ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine& engine, Tensor* x, bool trans_x, + bool is_fold_init_dims_x, Tensor* y, bool trans_y, + bool is_fold_init_dims_y, Tensor* out, + int execution_number) const { + // gradient is calculated in a different way when broadcasting is used + bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && + out->dims().size() == 2; + + Tensor x_combined, y_combined; + if (!need_combine) { + x_combined = *x; + y_combined = *y; + } else { + x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) + : FoldFirstAndLastDims(dev_ctx, x); + y_combined = is_fold_init_dims_y ? 
FoldOuterDims(*y) + : FoldFirstAndLastDims(dev_ctx, y); + } + + MatMulMKLDNNHandler handler( + dev_ctx, engine, ctx.GetPlace(), &x_combined, trans_x, &y_combined, + trans_y, out, ctx.Attr("alpha"), + ctx.InputName(framework::GradVarName("Out")) + + std::to_string(execution_number)); + + const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape( + framework::vectorize(out->dims())))); + } + + template + void RunKernel(const ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + + ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, true, true, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, true, false, dy, 1); + } else if (transpose_x) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, false, false, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, false, false, + &dout, false, true, dy, 1); + } else if (transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, false, true, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, false, true, dy, 1); + } else { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, true, true, + &dout, false, true, dy, 1); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; @@ -394,3 +684,7 @@ REGISTER_OP_KERNEL(matmul, MKLDNN, ::paddle::platform::CPUPlace, ops::DNNLMatMulKernel, ops::DNNLMatMulKernel, ops::DNNLMatMulKernel); + +REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::MatMulGradMKLDNNKernel, + ops::MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index b7bed95b1d33583682b997def63bb38243d1794d..04e0bcbfc7ce3550c6a874d1949094cea09661a1 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -43,7 +43,7 @@ class PoolingMKLDNNHandler platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), framework::ToMKLDNNDataType(input->type()), unique_name)) { - if (!this->isCached()) { + if (!this->isCachedNonBlocking()) { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, platform::errors::InvalidArgument( "Wrong layout set for Input tensor.")); @@ -100,11 +100,10 @@ class PoolingMKLDNNHandler const auto is_test = ctx.Attr("is_test"); const auto dt = framework::ToMKLDNNDataType(input->type()); - const auto fmt = input->format(); const auto exclude_padding = ctx.Attr("exclusive"); - const auto src_md = mkldnn::memory::desc(src_tz, dt, fmt); + const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format()); /* create memory descriptor for pooling without specified format * ('any') which lets a primitive (pooling in this case) choose * the memory format preferred for best performance @@ -124,7 +123,7 @@ class PoolingMKLDNNHandler ComputeAdaptivePoolParameters(ctx, src_tz, &ksize, &strides); - this->AcquireForwardPrimitiveDescriptor( + this->AcquireForwardPrimitiveDescriptorNonBlocking( is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training, pooling_type == "max" @@ -200,6 +199,10 @@ class PoolingMKLDNNHandler auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims()); + const auto dt = framework::ToMKLDNNDataType(in_x->type()); + auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format()); + auto dst_md = + mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any); auto diff_dst_md = mkldnn::memory::desc( diff_dst_tz, platform::MKLDNNGetDataType(), out_grad->format()); auto diff_src_md = @@ -216,7 +219,18 @@ class PoolingMKLDNNHandler ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides); const auto exclude_padding = ctx.Attr("exclusive"); - this->AcquireBackwardPrimitiveDescriptor( + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + mkldnn::prop_kind::forward_training, + pooling_type == "max" + ? mkldnn::algorithm::pooling_max + : (exclude_padding + ? mkldnn::algorithm::pooling_avg_exclude_padding + : mkldnn::algorithm::pooling_avg_include_padding), + src_md, dst_md, strides, ksize, mkldnn_paddings[0], + mkldnn_paddings[1]); + + this->AcquireBackwardPrimitiveDescriptorNonBlocking( pooling_type == "max" ? mkldnn::algorithm::pooling_max : (exclude_padding diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e91bbd15cfb7c612719ab3b6cdf2ac439b616ab5 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class ScaleMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + + float scale = (scale_tensor == nullptr) ? ctx.Attr("scale") + : (float)*(scale_tensor->data()); + float bias = ctx.Attr("bias"); + + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + + if (!bias_after_scale) bias *= scale; + + auto x_tz = framework::vectorize(x->dims()); + bool is_inplaced = x->IsSharedBufferWith(*out); + + platform::ActivationMKLDNNHandler handler( + x_tz, mkldnn::algorithm::eltwise_linear, scale, bias, x->format(), + dev_ctx, ctx.GetPlace(), ctx.InputName("X"), is_inplaced); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + auto activation_p = handler.AcquireForwardPrimitive(); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p}, + {MKLDNN_ARG_TO, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(scale, MKLDNN, paddle::platform::CPUPlace, + ops::ScaleMKLDNNKernel, + ops::ScaleMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 1138d5113929329462a7ea6ccd01f1b7bc375322..1d177e120b59f869d6f9ba96197a973d0ad62d5b 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -15,15 +15,6 @@ limitations under the License. 
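The MKL-DNN scale kernel above maps the scale op onto oneDNN's eltwise_linear primitive, which evaluates out = alpha * x + beta, by folding the bias when bias_after_scale is false. A minimal host-side sketch of that folding, purely illustrative and not part of the patch:

#include <cstdio>

// bias_after_scale == true : out = scale * x + bias
// bias_after_scale == false: out = scale * (x + bias) = scale * x + scale * bias
// Illustrative helper (ScaleOnce is a hypothetical name).
float ScaleOnce(float x, float scale, float bias, bool bias_after_scale) {
  if (!bias_after_scale) bias *= scale;  // same folding as in RunKernel
  return scale * x + bias;               // the form eltwise_linear evaluates
}

int main() {
  std::printf("%f %f\n", ScaleOnce(2.f, 3.f, 1.f, true),    // 7
              ScaleOnce(2.f, 3.f, 1.f, false));             // 9 == 3 * (2 + 1)
  return 0;
}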
*/ #include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { @@ -59,7 +50,7 @@ class SoftmaxMKLDNNHandler : platform::CreateKey( dev_ctx, framework::vectorize(input->dims()), uniq_name)) { - if (!this->isCached()) { + if (!this->isCachedNonBlocking()) { PADDLE_ENFORCE_EQ( input->dims(), output->dims(), platform::errors::InvalidArgument( @@ -69,27 +60,41 @@ class SoftmaxMKLDNNHandler auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), input->format()); - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); + this->AcquireForwardPrimitiveDescriptorNonBlocking( + prop_kind::forward_scoring, md, axis); } } - SoftmaxMKLDNNHandler(const std::vector& dims, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, const int& axis, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& uniq_name) + SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* out, + const Tensor* out_grad, Tensor* in_x_grad, + const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, uniq_name)) { - auto data_softmax_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_softmax_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); + platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument("The shape of softmax_grad's input " + "and output must be identical.")); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + prop_kind::forward_scoring, data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptorNonBlocking( + diff_softmax_md, data_softmax_md, axis); + } } }; @@ -145,27 +150,15 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const Tensor* output = ctx.Input("Out"); - auto* dout = ctx.template Input(framework::GradVarName("Out")); - auto* dx = - ctx.template Output(framework::GradVarName("X")); - - PADDLE_ENFORCE_EQ( - dout->dims(), dx->dims(), - platform::errors::InvalidArgument( - "The shape of softmax_grad's input and output must be identical.")); - - auto dims = dout->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - - auto softmax_tz = paddle::framework::vectorize(dims); + auto* out_grad = ctx.template Input(framework::GradVarName("Out")); + auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler 
handler(softmax_tz, output->format(), - dout->format(), axis, dev_ctx, - ctx.GetPlace(), ctx.InputName("Out")); + SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, + out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(out_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(in_x_grad); auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); @@ -176,8 +169,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(dout->format()); + in_x_grad->set_layout(framework::DataLayout::kMKLDNN); + in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } }; } // namespace operators diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index a7886cdd670d4d75ec1e5468dd059a8fbea081c6..7536654c5f5ccd7ea18911c9530c5ce42ba9ca3f 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -198,6 +198,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) " "Whether to use multi-precision during weight updating.") .SetDefault(false); + // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut + // as dispensable since they are not used when use_global_beta_pow is true. + AddAttr("use_global_beta_pow", + "(bool, default false) " + "Whether to use global beta_pow for whole model instead of " + "creating beta_pow for each parameter.") + .SetDefault(false); AddComment(R"DOC( Adam Optimizer. @@ -246,4 +253,16 @@ REGISTER_OP_VERSION(adam) "EpsilonTensor", "If provided, Adam will use this as epsilon, " "this has a higher priority than attr(epsilon). " - "For better performance in npu kernel. ")); + "For better performance in npu kernel. ")) + .AddCheckpoint( + R"ROC( + Upgrade adam, add 1 attribute [use_global_beta_pow]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_global_beta_pow", + "If true, Adam will use global beta_pow for whole model " + "instead of creating beta_pow for each parameter." 
+ "In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be " + "used in adam op, " + "and beta_pow will be updated after all adam op in the model.", + false)); diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 3d6f0f99a52dfbfd917ef2d8ef4a3d2524edb8cc..2ee2a08bf3bc63c34e18e668e9875d6ef6132951 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -154,6 +154,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); @@ -254,11 +256,13 @@ class AdamOpCUDAKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, param->numel()); - // Cpu update - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else { AdamKernelMEM<<>>( beta1, beta2, epsilon, beta1_pow->data(), @@ -269,14 +273,15 @@ class AdamOpCUDAKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, param->numel()); - // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } } - } else if (grad_var->IsType()) { auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { @@ -328,11 +333,13 @@ class AdamOpCUDAKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, rows, row_numel, grad_merge.rows().size(), lazy_mode, ndim); - // Update with cpu - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + // Update with cpu + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else { SparseAdamFunctor functor( beta1, beta2, epsilon, beta1_pow->data(), @@ -351,12 +358,14 @@ class AdamOpCUDAKernel : public framework::OpKernel { ctx.device_context()), param->numel()); for_range(functor); - // update beta1 and beta2 - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, 
beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } } } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 9667db8055b90cd5bf47060dc2cefe5e9b8477fe..bbd4179d84d896d16a6d7e0c8a4fcfbdf039a71d 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -406,6 +406,8 @@ class AdamOpKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); @@ -475,11 +477,12 @@ class AdamOpKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace())); functor(param->numel()); - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; - + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else if (grad_var->IsType()) { auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { @@ -523,10 +526,12 @@ class AdamOpKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); // update beta1 and beta2 - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + } if (lazy_mode) { VLOG(3) << "run cpu lazy mode"; size_t row_count = grad_merge.rows().size(); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 806e0fda07b592206ff35a463e6e9030d3a963c3..e5fe7f20a42e0b869bdefe34a683b640c0a108f4 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -49,8 +49,8 @@ class AdamNPUKernel : public framework::OpKernel { auto* mom2 = ctx.Input("Moment2"); auto* lr = ctx.Input("LearningRate"); - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); auto* param_out = ctx.Output("ParamOut"); auto* mom1_out = ctx.Output("Moment1Out"); @@ -58,25 +58,28 @@ class AdamNPUKernel : public framework::OpKernel { auto* beta1_pow_out = ctx.Output("Beta1PowOut"); auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + param_out->mutable_data(ctx.GetPlace()); mom1_out->mutable_data(ctx.GetPlace()); mom2_out->mutable_data(ctx.GetPlace()); - // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place. + // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform + // place. 
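Across the CUDA, CPU, NPU and XPU adam kernels the new use_global_beta_pow attribute gates the same per-parameter scalar update of the beta power accumulators. A minimal sketch of that semantics, with a hypothetical helper name and not part of the patch:

#include <cstdio>

// When use_global_beta_pow is false each adam op advances its own beta powers;
// when true, Beta1PowOut/Beta2PowOut are left untouched and beta_pow is updated
// once for the whole model after all adam ops have run.
void UpdateBetaPows(bool use_global_beta_pow, float beta1, float beta2,
                    const float* beta1_pow, const float* beta2_pow,
                    float* beta1_pow_out, float* beta2_pow_out) {
  if (use_global_beta_pow) return;
  beta1_pow_out[0] = beta1 * beta1_pow[0];
  beta2_pow_out[0] = beta2 * beta2_pow[0];
}

int main() {
  float b1p = 0.81f, b2p = 0.998001f, b1o = 0.f, b2o = 0.f;
  UpdateBetaPows(false, 0.9f, 0.999f, &b1p, &b2p, &b1o, &b2o);
  std::printf("beta1_pow_out=%f beta2_pow_out=%f\n", b1o, b2o);
  return 0;
}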
+ LoDTensor beta1_pow_tmp; + LoDTensor beta2_pow_tmp; if (beta1_pow->place() == platform::CPUPlace()) { T beta1 = *beta1_pow->data(); - // `mutable_data` operation needs to be done after getting data - beta1_pow_out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(beta1_pow_out, beta1); - } else { - beta1_pow_out->mutable_data(ctx.GetPlace()); + beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta1_pow_tmp, beta1); + beta1_pow = &beta1_pow_tmp; } if (beta2_pow->place() == platform::CPUPlace()) { T beta2 = *beta2_pow->data(); - beta2_pow_out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(beta2_pow_out, beta2); - } else { - beta2_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta2_pow_tmp, beta2); + beta2_pow = &beta2_pow_tmp; } const Tensor* beta1_tensor = nullptr; @@ -173,12 +176,16 @@ class AdamNPUKernel : public framework::OpKernel { *mom2, ctx.GetPlace(), ctx.template device_context(), mom2_out); } - auto runner_m1 = - NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); - runner_m1.Run(stream); - auto runner_m2 = - NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); - runner_m2.Run(stream); + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_out->mutable_data(ctx.GetPlace()); + auto runner_m1 = + NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); + runner_m1.Run(stream); + auto runner_m2 = + NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); + runner_m2.Run(stream); + } } }; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 09f117374499b07dfc4a227d2c22e8fa23c5d1f6..0f5706e428e15454e216af8e1067d31720cbf7c7 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -73,6 +73,9 @@ class AdamOpXPUKernel : public framework::OpKernel { "value is:%d.", beta2_pow_out->numel())); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + T beta1 = static_cast(ctx.Attr("beta1")); if (ctx.HasInput("Beta1Tensor")) { auto* beta1_tensor = ctx.Input("Beta1Tensor"); @@ -111,45 +114,48 @@ class AdamOpXPUKernel : public framework::OpKernel { mom1_out.template mutable_data(ctx.GetPlace()), mom2_out.template mutable_data(ctx.GetPlace()), param_out.template mutable_data(ctx.GetPlace()), param.numel()); - - // update in cpu and then copy to xpu - if (beta1_pow.place() == platform::CPUPlace() && - beta2_pow.place() == platform::CPUPlace()) { - const T* beta1_pow_p = beta1_pow.template data(); - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow_p[0]; - const T* beta2_pow_p = beta2_pow.template data(); - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow_p[0]; - } else { - T cpu_beta1_pow_out_data; - T cpu_beta2_pow_out_data; - memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), - beta1_pow_ptr, sizeof(T)); - - cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; - memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), - beta2_pow_ptr, sizeof(T)); - - cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; - - T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); - T* beta2_pow_out_p = 
beta2_pow_out->mutable_data(ctx.GetPlace()); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta1_pow_out_p, platform::CPUPlace(), - &cpu_beta1_pow_out_data, sizeof(T)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta2_pow_out_p, platform::CPUPlace(), - &cpu_beta2_pow_out_data, sizeof(T)); + if (!use_global_beta_pow) { + // update in cpu and then copy to xpu + if (beta1_pow.place() == platform::CPUPlace() && + beta2_pow.place() == platform::CPUPlace()) { + const T* beta1_pow_p = beta1_pow.template data(); + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow_p[0]; + const T* beta2_pow_p = beta2_pow.template data(); + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow_p[0]; + + } else { + T cpu_beta1_pow_out_data; + T cpu_beta2_pow_out_data; + + memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), + beta1_pow_ptr, sizeof(T)); + + cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; + memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), + beta2_pow_ptr, sizeof(T)); + + cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; + + T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); + T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta1_pow_out_p, platform::CPUPlace(), + &cpu_beta1_pow_out_data, sizeof(T)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta2_pow_out_p, platform::CPUPlace(), + &cpu_beta2_pow_out_data, sizeof(T)); + } + + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); } - - PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, - platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - r)); } else { PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( "Variable type not supported by adam_op")); diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index ee111a0ec7c0997d9d0380cd4be0c60683b0d3b1..0ebfb2f1bcd2203f987cdc656f0142eff4e009d2 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -59,16 +59,6 @@ HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out, size_t offset_i = offsets[i]; if (i == rank - 1) { - PADDLE_ENFORCE(x_stride == 1, - "When i:%d == rank:%d - 1, x_stride of random_crop_op " - "expected to be 1, but got %ld. Please check input " - "value.", - i, rank, x_stride); - PADDLE_ENFORCE(out_stride == 1, - "When i:%d == rank:%d - 1, out_stride of random_crop_op " - "expected to be 1, but got %ld. 
Please check input " - "value.", - i, rank, out_stride); x += offset_i; for (size_t j = 0; j < out_dim_i; ++j) { *out++ = *x++; diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 5f667999ee613961c44195836bcd36b0530a5c36..1174e72a76b1bb5aa744b964e289f0ac9c66596c 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -95,11 +95,11 @@ REGISTER_OPERATOR(real, ops::RealOp, ops::RealOpMaker, REGISTER_OPERATOR(real_grad, ops::RealGradOp); REGISTER_OP_CPU_KERNEL(real, ops::RealKernel, + paddle::platform::complex>, ops::RealKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL(real_grad, ops::RealGradKernel, + paddle::platform::complex>, ops::RealGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/real_op.cu b/paddle/fluid/operators/real_op.cu index b3d0855111b72f3eba4d9e737b4b650042f7238a..9bfb2878a6261bb5c69a1fb543e5aa15a87c5a8f 100644 --- a/paddle/fluid/operators/real_op.cu +++ b/paddle/fluid/operators/real_op.cu @@ -18,11 +18,11 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(real, ops::RealKernel, + paddle::platform::complex>, ops::RealKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL(real_grad, ops::RealGradKernel, + paddle::platform::complex>, ops::RealGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cc8ac200b8eec1505177ce752ed8f103908f46a --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +template +class XPULogsumexpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto axis = context.Attr>("axis"); + auto reduce_all = context.Attr("reduce_all"); + + const auto& input_dim_size = input->dims().size(); + // The dims has full dim, set the reduce_all is True + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(context.GetPlace()); + + std::vector axis_shape; + std::vector xdims(input_dim_size); + for (int i = 0; i < input_dim_size; ++i) { + xdims[i] = input->dims()[i]; + } + if (reduce_all) { + for (int i = 0; i < input_dim_size; ++i) { + axis_shape.push_back(i); + } + } else { + for (size_t i = 0; i < axis.size(); ++i) { + int rdim = axis[i] < 0 ? 
axis[i] + input_dim_size : axis[i]; + axis_shape.push_back(rdim); + } + } + + auto& dev_ctx = context.template device_context(); + int r = xpu::logsumexp(dev_ctx.x_context(), input_data, output_data, + xdims, axis_shape); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU logsumexp kernel error! error value[%d %]", r, + XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + logsumexp, + ops::XPULogsumexpKernel); +#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f4ea18edb2a95033cf99849a8b9f84e98203266c --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct CustomMin { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b < a) ? b : a; + } +}; + +template +struct CustomMax { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b > a) ? 
b : a; + } +}; + +template +struct CustomSum { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b + a; + } +}; + +template +struct CustomMul { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b * a; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index fdb2c57385b2bc1068c618f206bfeb6513d3d8c4..c8d568c8c2cf73041549a138085b72b41c0c297a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -100,6 +100,8 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel; -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, + CPUReduceMeanGradKernel, CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index cc3653fcb43a4c000d0c61c9d854965fafd59a9c..50d2fcdee23bd9e830f32e0cff4d367c3ad5ba66 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -65,5 +65,6 @@ class ReduceMeanKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, +REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, + ops::ReduceMeanKernel, ops::ReduceMeanKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index 289f574719ff03b1b09f313d05bab152f5c5d651..0e133d5447f93b8891c6de4cb5ad40ac7825493b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -20,5 +20,6 @@ using CUDAReduceMeanGradKernel = ops::ReduceGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, +REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh new file mode 100644 index 0000000000000000000000000000000000000000..91d7fb7c8439a2e9f0e3bf5257347c7b928cd3e7 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -0,0 +1,646 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/framework/array.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { +namespace detail { + +// Post processing function for sum, max, min, prod, any +template +struct IdentityFunctor { + DEVICE explicit inline IdentityFunctor() {} + + DEVICE inline T operator()(const T& x) const { return x; } +}; + +// Post processing function for mean +template +struct DivideFunctor { + DEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + DEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +static inline std::vector GetStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +#ifdef __HIPCC__ +constexpr int kMaxBlockDim = 256; +#else +constexpr int kMaxBlockDim = 512; +#endif + +static inline int GetDesiredBlockDim(int block_dim) { + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(block_dim))); +} + +static inline void CheckReduceRankIsValid(int reduce_rank, int rank) { + if (rank % 2 == 0) { + PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, + platform::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, " + "reduce_rank must be %d, but got %d.", + rank, rank / 2, reduce_rank)); + } else { + auto lower_rank = (rank - 1) / 2; + auto upper_rank = (rank + 1) / 2; + PADDLE_ENFORCE_EQ( + reduce_rank == lower_rank || reduce_rank == upper_rank, true, + platform::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " + "must be %d or %d, but got %d.", + rank, lower_rank, upper_rank, reduce_rank)); + } +} + +template +static inline paddle::framework::Array from( + const VectorLikeType& vec) { + PADDLE_ENFORCE_EQ(vec.size(), ElementCount, + platform::errors::InvalidArgument( + "Cub reduce Array: size not match. 
Received " + "vec.size() %d != ElementCount %d.", + vec.size(), ElementCount)); + size_t n = static_cast(vec.size()); + paddle::framework::Array ret; + for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; + return ret; +} + +} // namespace detail + +enum ReduceType { + kReduceAll = 0x00, + kReduceLastDim = 0x01, + kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim + kReduceAny = 0x03, +}; + +// reduce config +template +struct ReduceConfig { + ReduceConfig(std::vector origin_reduce_dims, std::vector x_dim) + : reduce_dims_origin(origin_reduce_dims), x_dim(x_dim) {} + + // get the parameters of reduceKernel + void Run() { + // step1: update the reduce_dim left_dim and x_dim + SetReduceDim(); + // step2: get the strides of dim for reduceAny and reduceLastDim + SetStrides(); + // step3: get the type of reduce + SetReduceType(); + // step4: set the block and grid for launch kernel + SetBlockDim(); + } + + // when should_reduce_again is true, we need malloc temp space for temp data + void SetOutputData(Ty* y_data, const platform::Place& place, + framework::Tensor& tmp) { + if (should_reduce_again) { + output_data = tmp.mutable_data( + framework::make_ddim( + {static_cast(left_num * grid.y * sizeof(Ty))}), + place); + } else { + output_data = y_data; + } + } + + private: + // set reduce_dim, left_dim and update x_dim + // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] + // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] + void SetReduceDim() { + std::set reduce_set; + + for (auto e : reduce_dims_origin) { + auto pos = e >= 0 ? e : e + x_dim.size(); + reduce_set.insert(pos); + } + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); + std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); + // get reduce_dim + if (reduce_dim_temp.size() > 1) { + int num = 0; // for update axis + reduce_dim.push_back(reduce_dim_temp[0]); + for (int idx = 1; idx < reduce_dim_temp.size(); idx++) { + // update x_dim + if (reduce_dim_temp[idx] - reduce_dim_temp[idx - 1] == 1) { + x_dim[reduce_dim_temp[idx - 1]] *= x_dim[reduce_dim_temp[idx]]; + x_dim.erase(x_dim.begin() + reduce_dim_temp[idx]); + num++; + } else { + reduce_dim.push_back(reduce_dim_temp[idx] - num); + } + } + } else { + reduce_dim = reduce_dim_temp; + } + + // update new_x_dim and new_reduce_dim + std::vector new_x_dim, new_reduce_dim_temp; + int is_reduced = 0; + for (auto e : reduce_dim) { + is_reduced |= 1 << e; + } + + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + new_x_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + new_reduce_dim_temp.push_back(new_x_dim.size() - 1); + } else { + new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; + } + } + + x_dim = new_x_dim; + reduce_dim = new_reduce_dim_temp; + + int x_rank = static_cast(x_dim.size()); + std::set left_set; + + for (int i = 0; i < x_rank; ++i) { + left_set.insert(i); + } + + for (auto e : reduce_dim) { + left_set.erase(e); + } + + left_dim.assign(left_set.begin(), left_set.end()); + } + + // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny + // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] + // --SetStrides--> x_strides= [6,1], reduce_strides = [1], + // left_strides = [1] + void SetStrides() { + std::vector idx_dim; + for (int i = 0; i < x_dim.size(); i++) { + idx_dim.push_back(i); + } + + x_strides = detail::GetStrides(x_dim, idx_dim); + reduce_strides = detail::GetStrides(x_dim, reduce_dim); + left_strides = detail::GetStrides(x_dim, left_dim); 
+ reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + + left_num = 1; + if (left_dim.size()) { + left_num = left_strides[0] * x_dim[left_dim[0]]; + } + } + + // get the reduceType + // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim + // x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim + // x_dim = [8] reduce_dim = [0] --> reduceAll + // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny + void SetReduceType() { + int rank = x_dim.size(); + int reduce_rank = reduce_dim.size(); + + if (rank == reduce_rank) { + reduce_type = static_cast(ReduceType::kReduceAll); + + } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + reduce_type = static_cast(ReduceType::kReduceLastDim); + } else if (reduce_rank == 1) { + // ReduceFirstDim and reduceSecondDim + reduce_type = static_cast(ReduceType::kReduceHigherDim); + + } else { + reduce_type = static_cast(ReduceType::kReduceAny); + } + } + + // set block and grid for launch kernel + // for ReduceHigherDim: if block is enough -> splite reduce_num + // else init block(32, 1) grid(block_num, 1) + // for others: block(block_num, 1) , grid(left_num, 1) + void SetBlockDim() { + // init + int block_num = detail::GetDesiredBlockDim(reduce_num); + should_reduce_again = false; + + dim3 block_dim(block_num, 1); + dim3 grid_dim(left_num, 1); + blocking_size = reduce_num; + + if (reduce_type == ReduceType::kReduceHigherDim) { + int last_dim_num = x_dim.back(); + // update left_num + int grid_z = left_num / last_dim_num; + left_num = last_dim_num; + + block_dim.z = 1; + grid_dim.z = grid_z; + + int device_id = platform::GetCurrentDeviceId(); + int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_threads_per_mp = + platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + int max_threads = max_threads_per_mp * max_mp; + + // init + int num_block = (max_threads / left_num); + + if (num_block > 1 && reduce_num >= 512) { + blocking_size = detail::GetLastPow2(reduce_num / num_block); + + if (blocking_size <= 1) { + blocking_size = detail::GetLastPow2(sqrt(reduce_num)); + } else if (blocking_size * 2 < reduce_num) { + blocking_size *= 2; + } + + should_reduce_again = true; + + block_dim.x = 32; + block_dim.y = 1; + grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x; + grid_dim.y = (reduce_num + blocking_size - 1) / blocking_size; + + } else { + block_dim.x = 32; + block_dim.y = 1; + blocking_size = reduce_num; + grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x; + grid_dim.y = 1; + } + } + + block = block_dim; + grid = grid_dim; + } + + public: + std::vector reduce_dims_origin; + std::vector reduce_dim; + std::vector x_dim; + std::vector left_dim; + std::vector x_strides; + std::vector left_strides; + std::vector reduce_strides; + + int reduce_type; + int reduce_num; + int left_num; + int blocking_size; + bool should_reduce_again; + + Ty* output_data; + + dim3 block; + dim3 grid; +}; + +template +__device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, + ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + int idx_x = blockIdx.x * reduce_num; + int idx_y = threadIdx.x; + Ty reduce_var = init; + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) + reduce_var = reducer(reduce_var, static_cast(x[idx_x + idx_y])); + __syncthreads(); + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = transformer(reduce_var); + } +} + 
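The ReduceConfig logic above first merges adjacent reduce dimensions (e.g. x_dim = [2, 4, 6] with reduce dims [0, 1] becomes x_dim = [8, 6] with reduce dim [0]) and then selects a kernel variant. A small host-side sketch of that classification rule, using illustrative names and not part of the patch:

#include <cassert>
#include <vector>

// After merging adjacent reduce dims, a reduction is
//  - kAll       when every dim is reduced,
//  - kLastDim   when rank == 2 and only dim 1 is reduced,
//  - kHigherDim when exactly one non-last dim is reduced,
//  - kAny       otherwise.
enum class IllustrativeReduceType { kAll, kLastDim, kHigherDim, kAny };

IllustrativeReduceType Classify(const std::vector<int>& x_dim,
                                const std::vector<int>& reduce_dim) {
  const int rank = static_cast<int>(x_dim.size());
  const int reduce_rank = static_cast<int>(reduce_dim.size());
  if (reduce_rank == rank) return IllustrativeReduceType::kAll;
  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1)
    return IllustrativeReduceType::kLastDim;
  if (reduce_rank == 1) return IllustrativeReduceType::kHigherDim;
  return IllustrativeReduceType::kAny;
}

int main() {
  // The examples given in the SetReduceType comment:
  assert(Classify({8}, {0}) == IllustrativeReduceType::kAll);
  assert(Classify({8, 6}, {1}) == IllustrativeReduceType::kLastDim);
  assert(Classify({8, 6}, {0}) == IllustrativeReduceType::kHigherDim);
  assert(Classify({8, 6, 4, 2}, {0, 2}) == IllustrativeReduceType::kAny);
  return 0;
}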
+template +__device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, + ReduceOp reducer, + TransformOp transformer, + Ty init, int reduce_num, + int left_num, int block_size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * block_size; + + Ty temp = init; + Ty reduce_var = init; + + if (idx < left_num) { + int loop = reduce_num - idy; + loop = loop > block_size ? block_size : loop; + for (int iy = 0; iy < loop; iy++) { + int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; + reduce_var = reducer(reduce_var, static_cast(x[id])); + } + y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = + static_cast(transformer(reduce_var)); + } +} + +template +__device__ __forceinline__ void ReduceAny( + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + int reduce_num, paddle::framework::Array x_strides, + paddle::framework::Array reduce_dim, + paddle::framework::Array reduce_strides, + paddle::framework::Array left_dim, + paddle::framework::Array left_strides) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + + int sub_index[Rank]; + int left_idx = blockIdx.x; + for (int i = 0; i < Rank - ReduceRank; ++i) { + sub_index[left_dim[i]] = left_idx / left_strides[i]; + left_idx %= left_strides[i]; + } + + int reduce_idx = threadIdx.x; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + Ty reduce_var = static_cast(x[idx_x]); + + for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { + int reduce_idx = i; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + reduce_var = + static_cast(reducer(reduce_var, static_cast(x[idx_x]))); + } + __syncthreads(); + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = transformer(reduce_var); + } +} + +template +__device__ __forceinline__ void ReduceModule( + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + int reduce_num, int left_num, int blocking_size, + paddle::framework::Array x_strides, + paddle::framework::Array reduce_dim, + paddle::framework::Array reduce_strides, + paddle::framework::Array left_dim, + paddle::framework::Array left_strides) { + if (ReduceType == ReduceType::kReduceLastDim) { + ReduceLastDim( + x, y, reducer, transformer, init, reduce_num); + + } else if (ReduceType == ReduceType::kReduceHigherDim) { + ReduceHigherDim( + x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); + + } else { + ReduceAny( + x, y, reducer, transformer, init, reduce_num, x_strides, reduce_dim, + reduce_strides, left_dim, left_strides); + } +} + +template +__global__ void ReduceKernelFunction( + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + int reduce_num, int left_num, int block_size, + paddle::framework::Array x_strides, + paddle::framework::Array reduce_dim, + paddle::framework::Array reduce_strides, + paddle::framework::Array left_dim, + paddle::framework::Array left_strides) { + ReduceModule(x, y, reducer, transformer, init, reduce_num, + left_num, block_size, x_strides, reduce_dim, + reduce_strides, left_dim, left_strides); +} + +template 
+static void launchKernel(const Tx* x_data, Ty* y_data, + const platform::Place& place, const ReduceOp& reducer, + const TransformOp& transformer, const Ty& init, + gpuStream_t stream, ReduceConfig config) { +#define CUB_REDUCE_TYPE_CASE(type) \ + case type: { \ + constexpr auto kReduceType = type; \ + ReduceKernelFunction< \ + Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ + kReduceType><<>>( \ + x_data, config.output_data, reducer, transformer, init, \ + config.reduce_num, config.left_num, config.blocking_size, \ + detail::from(config.x_strides), \ + detail::from(config.reduce_dim), \ + detail::from(config.reduce_strides), \ + detail::from(config.left_dim), \ + detail::from(config.left_strides)); \ + } break + + switch (config.reduce_type) { + CUB_REDUCE_TYPE_CASE(1); // reduceLastDim + CUB_REDUCE_TYPE_CASE(2); // ReduceHigherDim + CUB_REDUCE_TYPE_CASE(3); // reduceAny + } + + if (config.should_reduce_again) { + dim3 block(config.block.x, 1, 1); + dim3 grid(config.grid.x, 1, config.grid.z); + + ReduceKernelFunction< + Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, + ReduceType::kReduceHigherDim><<>>( + config.output_data, y_data, reducer, detail::IdentityFunctor(), + init, config.grid.y, config.left_num, config.grid.y, + detail::from(config.x_strides), + detail::from(config.reduce_dim), + detail::from(config.reduce_strides), + detail::from(config.left_dim), + detail::from(config.left_strides)); + } +} + +template +static void launchReduceKernel(const Tx* x_data, Ty* y_data, + const platform::Place& place, + const ReduceOp& reducer, + const TransformOp& transformer, const Ty& init, + gpuStream_t stream, ReduceConfig config) { + int reduce_rank = config.reduce_strides.size(); + int rank = config.x_strides.size(); + +#define CUB_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kRank = i; \ + switch (reduce_rank) { __VA_ARGS__; } \ + } break + +#define CUB_REDUCE_RANK_CASE(i, ...) 
\ + case i: { \ + constexpr auto kReduceRank = i; \ + launchKernel( \ + x_data, y_data, place, reducer, transformer, init, stream, config); \ + } break + + // launch CUB::Reduce + if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + cub::TransformInputIterator trans_x( + x_data, transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + place); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + + return; + } + + detail::CheckReduceRankIsValid(reduce_rank, rank); + switch (rank) { + CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); + + CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);); + + CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(2);); + + CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(2); CUB_REDUCE_RANK_CASE(3);); + + CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(3);); + + CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);); + + CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(4);); + + CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(4); CUB_REDUCE_RANK_CASE(5);); + } + +#undef CUB_REDUCE_RANK_CASE +#undef CUB_RANK_CASE +} +template +void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, const Ty& init, + const ReduceOp& reducer, const TransformOp& transformer, + gpuStream_t stream) { + auto x_dim = framework::vectorize(x.dims()); + auto config = ReduceConfig(origin_reduce_dims, x_dim); + config.Run(); + + auto x_data = x.data(); + auto y_data = y->mutable_data(x.place()); + + framework::Tensor tmp; + // SetOutputData for ReduceHigherDim when should_reduce_again is true, + // temp_output should be stored temp_data in output_data space or stored in + // y_data; + config.SetOutputData(y_data, x.place(), tmp); + + if (config.reduce_num == 1) { + auto out_dims = y->dims(); + framework::TensorCopy(x, y->place(), y); + y->Resize(out_dims); + return; + } + +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + launchReduceKernel( \ + x_data, y_data, x.place(), reducer, transformer, init, stream, \ + config); \ + } break + + switch (detail::GetDesiredBlockDim(config.reduce_num)) { + CUB_BLOCK_DIM_CASE(512); + CUB_BLOCK_DIM_CASE(256); + CUB_BLOCK_DIM_CASE(128); + CUB_BLOCK_DIM_CASE(64); + CUB_BLOCK_DIM_CASE(32); + CUB_BLOCK_DIM_CASE(16); + CUB_BLOCK_DIM_CASE(8); + CUB_BLOCK_DIM_CASE(4); + CUB_BLOCK_DIM_CASE(2); + } +#undef CUB_BLOCK_DIM_CASE +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 46a1fe6af93fd9e0a16e82111dd2095cea943890..8c1d209bdb682222c35d807586c65fe49c2466e9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -111,8 +111,10 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( - reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, @@ -130,7 +132,8 @@ using CPUReduceSumGradKernel = ops::ReduceSumGradKernel; -REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel, +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel, + CPUReduceSumGradKernel, 
CPUReduceSumGradKernel, CPUReduceSumGradKernel, CPUReduceSumGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu index 219cc231a1ea7a0786026d6dcc6d63ce78e24025..dbd020514b2088a336184c4f1ca4f367dd3a14a3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu @@ -70,7 +70,8 @@ class ReduceSumKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, +REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, + ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index f2bee6dddc39ec965966e4964c954e5fb1441bf5..67de8bb9a0c1ab4ae917b7e267fc2748087d900e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -20,7 +20,8 @@ using CUDAReduceSumGradKernel = ops::ReduceGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel, +REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index a9b1f299dab82791e6a98afb2b75d65b1703a5a2..a71f49585bfcad0c03f17df647173fd946b4b7e2 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -54,6 +54,21 @@ class ScaleOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,6 +102,9 @@ $$Out = scale*(X + bias)$$ "Apply bias addition after or before scaling. It is useful for " "numeric stability in some circumstances.") .SetDefault(true); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); } }; @@ -112,6 +130,8 @@ class ScaleGradMaker : public framework::SingleGradOpMaker { grad_op->SetAttr("scale", this->GetAttr("scale")); grad_op->SetAttr("bias", 0.0f); grad_op->SetAttr("bias_after_scale", true); + if (grad_op->HasAttr("use_mkldnn")) + grad_op->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 4aec4c174227921d6b396033d26550145dbd6bb2..8fe456edeabf11742d96b48c08204a29c6028132 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -15,44 +15,481 @@ limitations under the License. 
*/ #include namespace cub = hipcub; #endif +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/softmax_impl.cuh" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/for_range.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#else +#include "paddle/fluid/platform/cudnn_helper.h" +#endif namespace paddle { namespace operators { +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; -namespace { +// Wrapper of log function. Use log(float32) for float16 template -__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, - const int64_t n, const int64_t d, - const int64_t remain, const int ignore_index) { - CUDA_KERNEL_LOOP_TYPE(index, n * remain, int64_t) { - int64_t idx_n = index / remain; - int64_t idx_remain = index % remain; - int64_t tmp = labels[index]; - if (ignore_index != tmp) { - int64_t idx = idx_n * d + tmp * remain + idx_remain; - logit_grad[idx] -= static_cast(1.); +static __device__ __forceinline__ T Log(T x) { + using AccT = typename details::MPTypeTrait::Type; + AccT logx = std::log(static_cast(x)); + return math::TolerableValue()(static_cast(logx)); +} + +// Wrapper of exp function. Use exp(float32) for float16 +template +static __device__ __forceinline__ T Exp(T x) { + using AccT = typename details::MPTypeTrait::Type; + AccT expx = std::exp(static_cast(x)); + return math::TolerableValue()(static_cast(expx)); +} + +// log2(value) +static inline int Log2Ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +enum class SoftmaxMode { kSoftmax, kLogSoftmax, kCrossEntropy }; + +/* + Hard label cross entropy. +*/ +template +__global__ void CrossEntropyHardLabel(T* loss, const T* softmax, + const int64_t* labels, const int n, + const int dim, const int d, + const int ignore_idx) { + int64_t ids = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx_n = ids / d; + int64_t idx_d = ids % d; + + // thread ids compute loss[ids] using softmax[idx] + if (ids < n * d) { + int64_t idx = idx_n * dim * d + labels[ids] * d + idx_d; + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (labels[ids] == ignore_idx) { + loss[ids] = static_cast(0.0); + } else { + loss[ids] = -Log(softmax[idx]); + } + } else { + // IgnoreIndex is false + loss[ids] = -Log(softmax[idx]); } } } +/* + Hard label cross entropy with exp. 
+ Input: log softmax + Output: loss and exp(input) +*/ +template +__global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, + const int64_t* labels, const int n, + const int dim, const int d, + const int ignore_idx) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + if (idx < n * dim * d) { + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (idx_dim == labels[ids]) { + if (labels[ids] == ignore_idx) { + loss[ids] = static_cast(0.0); + } else { + loss[ids] = -softmax[idx]; + } + } + } else { + // IgnoreIndex is false + if (labels[ids] >= 0 && labels[ids] < dim) { + if (labels[ids] == idx_dim) { + loss[ids] = -softmax[idx]; + } + } else { + loss[ids] = static_cast(0.0); + } + } + softmax[idx] = Exp(softmax[idx]); + } +} + +/* + Core function of softmax with cross entropy forward + - softmax, SoftmaxMode=kSoftmax + - log softmax, SoftmaxMode=kLogSoftmax + - softmax with cross entropy hard label, SoftmaxMode=kCrossEntropy + The computation includes + - Compute max value: maxvalue_{i} = max_j src_{i,j} + - Compute sum of exp: s_{i} = sum_{j}{e^{src_{i,j} - maxvalue_{i}}} + - Compute: softmax_{i,j} = e^{src_{i,j} - maxvalue_{i}} / s_{i} + - Compute: logsoftmax_{i,j} = src_{i,j} - maxvalue_{i} - log(s_{i}) + - Compute: loss_{i} = -logsoftmax[i,label[i]] (Hard label) + This computation results from following formula: + softmax_{i,j} = e^{src_{i,j}} / sum_{j}{e^{src_{i,j}}} + = e^{src_{i,j} - maxvalue_{i}} + / sum_{j}{e^{src_{i,j} - maxvalue_{i}}} + = e^{src_{i,j} - maxvalue_{i}} / s_{i} + logsoftmax_{i,j} = log(softmax_{i,j}) + = src_{i,j} - maxvalue_{i} - log(s_{i}) + One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). + For reduction max (sum), firstly compute max (sum) to one warp, then use + shuffle api to compute max (sum) in one warp. +*/ +template +__global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, + const int64_t* label, const int batch_size, + const int stride, const int element_count, + const int ignore_index) { + constexpr int kDimCeil = 1 << Log2Elements; + constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; + constexpr int kVSize = sizeof(VecT) / sizeof(T); + constexpr int kIterations = kDimCeil / kWarpSize; + constexpr int kIterationsV = + (kIterations >= kVSize) ? (kIterations / kVSize) : 1; + constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; + + // max index to read + int idx_max_v[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; i++) { + int idx_max = ((i + first_batch) < batch_size) ? 
element_count : 0; + idx_max_v[i] = idx_max / kVSize; + } + + // read data from global memory + AccT srcdata[kBatchSize][kIterationsV][kVSize]; + +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { +// read data to srcdata: - KVSize==1, - KVSize>1 +#pragma unroll + for (int it = 0; it < kIterationsV; ++it) { + int src_idx = threadIdx.x + it * kWarpSize; + if (kVSize == 1) { + if (src_idx < idx_max_v[i]) { + srcdata[i][it][0] = + static_cast(src[(first_batch + i) * stride + src_idx]); + } else { + srcdata[i][it][0] = -std::numeric_limits::infinity(); + } + } else { + const VecT* src_v = + reinterpret_cast(&src[(first_batch + i) * stride]); + if (src_idx < idx_max_v[i]) { + VecT srctmp = src_v[src_idx]; + const T* srcinptr = reinterpret_cast(&srctmp); +#pragma unroll + for (int s = 0; s < kVSize; s++) { + srcdata[i][it][s] = static_cast(srcinptr[s]); + } + } else { +#pragma unroll + for (int s = 0; s < kVSize; s++) { + srcdata[i][it][s] = -std::numeric_limits::infinity(); + } + } + } + } + } + + // compute max value: maxvalue_{i} = max_j src_{i,j} + AccT max_value[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + // it = 0 + AccT valmax = srcdata[i][0][0]; +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s]; + } + max_value[i] = valmax; + +// it = 1, 2, ... +#pragma unroll + for (int it = 1; it < kIterationsV; ++it) { + AccT valmax = srcdata[i][it][0]; +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s]; + } + max_value[i] = (max_value[i] > valmax) ? max_value[i] : valmax; + } + } + WarpReduceMax(max_value); + + // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } + AccT sum[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + // it = 0 + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] = std::exp(srcdata[i][0][0] - max_value[i]); + } else { + srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]); + sum[i] = srcdata[i][0][0]; + } +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] += std::exp(srcdata[i][0][s] - max_value[i]); + } else { + srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]); + sum[i] += srcdata[i][0][s]; + } + } + +// it = 1, 2, ... 
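// Remaining vectorized iterations (it = 1 .. kIterationsV-1) are folded into
// sum[i] in the same way; in kSoftmax mode the exponentials are kept in
// srcdata so the write phase can reuse them. WarpReduceSum then combines the
// per-lane partial sums across the warp, so every thread ends up holding the
// full row denominator (its log is the normalizer used by the log-softmax and
// cross-entropy modes below).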
+#pragma unroll + for (int it = 1; it < kIterationsV; ++it) { +#pragma unroll + for (int s = 0; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] += std::exp(srcdata[i][it][s] - max_value[i]); + } else { + srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]); + sum[i] += srcdata[i][it][s]; + } + } + } + } + WarpReduceSum(sum); + +// write data +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] = std::log(sum[i]); + } + +#pragma unroll + for (int it = 0; it < kIterationsV; ++it) { + int idx = threadIdx.x + it * kWarpSize; + if (kVSize == 1) { // kVSize==1 + if (idx < idx_max_v[i]) { + if (mode == SoftmaxMode::kLogSoftmax) { // log softmax + softmax[(first_batch + i) * stride + idx] = + srcdata[i][it][0] - max_value[i] - sum[i]; + // softmax with cross entropy hard label + } else if (mode == SoftmaxMode::kCrossEntropy) { + AccT logsoftmax = srcdata[i][it][0] - max_value[i] - sum[i]; + // softmax + softmax[(first_batch + i) * stride + idx] = std::exp(logsoftmax); + // label + int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (label[first_batch + i] == loss_idx) { + if (label[first_batch + i] != ignore_index) { + loss[first_batch + i] = -logsoftmax; + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { + // IgnoreIndex is false + if (label[first_batch + i] >= 0 && + label[first_batch + i] < element_count) { + if (label[first_batch + i] == loss_idx) { + loss[first_batch + i] = -logsoftmax; + } + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { // softmax + softmax[(first_batch + i) * stride + idx] = + srcdata[i][it][0] / sum[i]; + } + } else { + break; + } + } else { // KVSize>1 + VecT* softmax_v = + reinterpret_cast(&softmax[(first_batch + i) * stride]); + VecT tmpdata; + T* tmpptr = reinterpret_cast(&tmpdata); +#pragma unroll + for (int s = 0; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax) { // log softmax + tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i]; + // softmax with cross entropy hard label + } else if (mode == SoftmaxMode::kCrossEntropy) { + AccT logsoftmax = srcdata[i][it][s] - max_value[i] - sum[i]; + // softmax + tmpptr[s] = std::exp(logsoftmax); + // label + int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (label[first_batch + i] == loss_idx && + label[first_batch + i] != ignore_index) { + loss[first_batch + i] = -logsoftmax; + } + } else { + // IgnoreIndex is false + if (label[first_batch + i] >= 0 && + label[first_batch + i] < element_count) { + if (label[first_batch + i] == loss_idx) { + loss[first_batch + i] = -logsoftmax; + } + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { // softmax + tmpptr[s] = srcdata[i][it][s] / sum[i]; + } + } + if (idx < idx_max_v[i]) { + softmax_v[idx] = tmpdata; + } else { + break; + } + } + } + } +} + +#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, VecT, AccT) \ + case Log2Elements: \ + WarpSoftmaxForward<<>>( \ + loss, softmax, src, label, batch_size, stride, element_count, \ + ignore_index); \ + break; + +/* + Wrapper of softmax with cross entropy forward hard label. 
+*/ +template +void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, + const int64_t* label, const int batch_size, + const int stride, const int element_count, + const int ignore_index, gpuStream_t stream) { + using AccT = typename details::MPTypeTrait::Type; + + // use 128 threads per block to maximimize gpu utilization + const int Log2Elements = static_cast(Log2Ceil(element_count)); + const int kDimCeil = 1 << Log2Elements; + int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; + int batches_per_warp = (kDimCeil <= 128) ? 2 : 1; + constexpr int threads_per_block = 128; + int warps_per_block = (threads_per_block / kWarpSize); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_size + batches_per_block - 1) / batches_per_block; + dim3 threads(kWarpSize, warps_per_block, 1); + + switch (Log2Elements) { + SOFTMAX_WARP_FORWARD_CASE(0, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(1, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(2, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(3, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(4, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(5, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(6, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(7, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(8, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(9, T, AccT); + default: + break; + } +} + +/* + Wrapper of softmax with cross entropy hard label. + - SwitchWarpSoftmaxForward for small size + - cudnn function for large size +*/ +template +static void SoftmaxWithCrossEntropyHardLabel( + const platform::CUDADeviceContext& ctx, int rank, int axis, + const T* logits_data, const int64_t* labels_data, T* loss_data, + T* softmax_data, int N, int dim, int D, const int ignore_index) { + auto stream = ctx.stream(); + constexpr int max_dim = 320; + if (D == 1 && dim <= max_dim) { // small size + const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; + SwitchWarpSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, dim, + ignore_index, stream); + } else { + ScopedTensorDescriptor desc; + std::vector tensor_dims = {N, dim, D, 1}; + DataLayout layout = DataLayout::kNCHW; +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#else + cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#endif + + auto handle = ctx.cudnn_handle(); + +#ifdef PADDLE_WITH_HIP + auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE + : MIOPEN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + handle, platform::CudnnDataType::kOne(), descp, logits_data, + platform::CudnnDataType::kZero(), descp, softmax_data, + MIOPEN_SOFTMAX_LOG, mode)); +#else + auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE + : CUDNN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), + descp, logits_data, platform::CudnnDataType::kZero(), descp, + softmax_data)); +#endif + int threads = 128; + int blocks = (N * dim * D + threads - 1) / threads; + // compute cross entropy, input is log softmax + CrossEntropyExpHardLabel<<>>( + loss_data, softmax_data, labels_data, N, dim, D, ignore_index); + } +} + +/* + Wrapper of softmax with cross entropy grad hard label. 
+*/ template -__global__ void Scale(T* logit_grad, const T* loss_grad, const int64_t num, - const int64_t d, const int64_t remain, - const int64_t* labels, const int ignore_index) { - CUDA_KERNEL_LOOP_TYPE(index, num, int64_t) { - int64_t idx_n = index / d; - int64_t idx_remain = index % remain; - int64_t idx_lbl = idx_n * remain + idx_remain; - if (labels[idx_lbl] == ignore_index) { - logit_grad[index] = static_cast(0.); +__global__ void SoftmaxWithCrossEntropyGradHardLabel( + T* logits_grad, const T* loss_grad, const int64_t* labels, const int64_t n, + const int64_t dim, const int64_t d, const int ignore_index) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + if (idx < n * dim * d) { + if (labels[ids] == ignore_index) { + logits_grad[idx] = static_cast(0.0); + } else if (labels[ids] == idx_dim) { + logits_grad[idx] = + (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logit_grad[index] *= loss_grad[idx_lbl]; + logits_grad[idx] *= loss_grad[ids]; } } } @@ -123,8 +560,6 @@ __global__ void ScaleCrossEntropyGradient(T* logit_grad, const T* loss_grad, } } -} // namespace - static __device__ __forceinline__ platform::float16 exp_on_device( platform::float16 x) { return ::Eigen::numext::exp(x); @@ -396,278 +831,6 @@ static __global__ void RowReductionForCrossEntropy(const T* logits_data, if (threadIdx.x == 0) loss_data[blockIdx.x] = loss; } -template -struct HardLabelCrossEntropyFunctor { - public: - HardLabelCrossEntropyFunctor(const int64_t* labels, T* loss, - const T* logits_data, int d, int axis_dim) - : labels_(labels), - loss_(loss), - logits_data_(logits_data), - d_(d), - axis_dim_(axis_dim) {} - - __device__ void operator()(int idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int remain = d_ / axis_dim_; - int idx_n = idx / d_; - int idx_axis = (idx % d_) / remain; - int idx_remain = idx % remain; - // labels, loss view as [n, remain] - int idx_lbl = idx_n * remain + idx_remain; - // It also would ignore labels not in range(class_num). 
- if (idx_axis != labels_[idx_lbl]) { - } else { - loss_[idx_lbl] = -log_on_device(logits_data_[idx]); - } - } - - private: - const int64_t* labels_; - T* loss_; - const T* logits_data_; - int d_; - int axis_dim_; -}; - -template -struct HardLabelCrossEntropyFunctorWithIgnoreIdx { - public: - HardLabelCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, T* loss, - const T* logits_data, int d, - int axis_dim, int ignore_idx) - : labels_(labels), - loss_(loss), - logits_data_(logits_data), - d_(d), - axis_dim_(axis_dim), - ignore_idx_(ignore_idx) {} - - __device__ void operator()(int idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int remain = d_ / axis_dim_; - int idx_n = idx / d_; - int idx_axis = (idx % d_) / remain; - int idx_remain = idx % remain; - // labels, loss view as [n, remain] - int idx_lbl = idx_n * remain + idx_remain; - - if (idx_axis == labels_[idx_lbl] && idx_axis != ignore_idx_) { - loss_[idx_lbl] = -log_on_device(logits_data_[idx]); - } - } - - private: - const int64_t* labels_; - T* loss_; - const T* logits_data_; - int d_; - int axis_dim_; - int ignore_idx_; -}; - -template -static void HardLabelCrossEntropy(const platform::CUDADeviceContext& ctx, - const T* logits_data, - const int64_t* labels_data, T* loss_data, - int n, int d, int axis_dim, int ignore_idx) { - constexpr int kMaxBlockDim = 512; - int block_dim = axis_dim >= kMaxBlockDim - ? kMaxBlockDim - : (1 << static_cast(std::log2(axis_dim))); - int grid_dim = n * d / axis_dim; - auto stream = ctx.stream(); - -#define CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ - case BlockDim: { \ - platform::ForRange for_range(ctx, n* d); \ - if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ - for_range(HardLabelCrossEntropyFunctorWithIgnoreIdx( \ - labels_data, loss_data, logits_data, d, axis_dim, ignore_idx)); \ - } else { \ - for_range(HardLabelCrossEntropyFunctor(labels_data, loss_data, \ - logits_data, d, axis_dim)); \ - } \ - } break - - switch (block_dim) { - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(512); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(256); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(128); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(64); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(32); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(16); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(8); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(4); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(2); - default: - PADDLE_THROW(platform::errors::Unavailable( - "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); - break; - } -#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL -} - -template -struct HardLabelSoftmaxWithCrossEntropyFunctor { - public: - HardLabelSoftmaxWithCrossEntropyFunctor(const int64_t* labels, T* loss, - T* log_softmax, int64_t d, - int axis_dim, int ignore_idx) - : labels_(labels), - loss_(loss), - log_softmax_(log_softmax), - d_(d), - axis_dim_(axis_dim), - ignore_idx_(ignore_idx) {} - - __device__ void operator()(int64_t idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int64_t remain = d_ / axis_dim_; - int64_t idx_n = idx / d_; - int64_t idx_axis = (idx % d_) / remain; - int64_t idx_remain = idx % remain; - // labels, loss view as [n, remain] - int64_t idx_lbl = idx_n * remain + idx_remain; - PADDLE_ENFORCE(labels_[idx_lbl] >= 0 && labels_[idx_lbl] < d_ || - labels_[idx_lbl] == ignore_idx_, - "The value of label[%ld] expected >= 0 and < %ld, or == %d," - "but got %ld. 
Please check input value.", - idx_lbl, d_, ignore_idx_, labels_[idx_lbl]); - // It also would ignore labels not in range(class_num). - if (idx_axis != labels_[idx_lbl]) { - log_softmax_[idx] = exp_on_device(log_softmax_[idx]); - } else { - auto softmax = log_softmax_[idx]; - log_softmax_[idx] = exp_on_device(softmax); - loss_[idx_lbl] = -softmax; - } - } - - private: - const int64_t* labels_; - T* loss_; - T* log_softmax_; - int64_t d_; - int axis_dim_; - int ignore_idx_; -}; - -template -struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { - public: - HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, - T* loss, T* log_softmax, - int64_t d, int axis_dim, - int ignore_idx) - : labels_(labels), - loss_(loss), - log_softmax_(log_softmax), - d_(d), - axis_dim_(axis_dim), - ignore_idx_(ignore_idx) {} - - __device__ void operator()(int64_t idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int64_t remain = d_ / axis_dim_; - int64_t idx_n = idx / d_; - int64_t idx_axis = (idx % d_) / remain; - int64_t idx_remain = idx % remain; - // labels, loss view as [n, remain] - int64_t idx_lbl = idx_n * remain + idx_remain; - if (idx_axis != labels_[idx_lbl] || idx_axis == ignore_idx_) { - log_softmax_[idx] = exp_on_device(log_softmax_[idx]); - } else { - auto softmax = log_softmax_[idx]; - log_softmax_[idx] = exp_on_device(softmax); - loss_[idx_lbl] = -softmax; - } - } - - private: - const int64_t* labels_; - T* loss_; - T* log_softmax_; - int64_t d_; - int axis_dim_; - int ignore_idx_; -}; - -template -static void HardLabelSoftmaxWithCrossEntropy( - const platform::CUDADeviceContext& ctx, const T* logits_data, - const int64_t* labels_data, T* loss_data, T* softmax_data, int64_t n, - int64_t d, int axis_dim, int ignore_idx) { -#ifdef __HIPCC__ - // HIP platform will have loss nan if dim size > 256 - constexpr int kMaxBlockDim = 256; -#else - constexpr int kMaxBlockDim = 512; -#endif - int64_t block_dim = axis_dim >= kMaxBlockDim - ? 
kMaxBlockDim - : (1 << static_cast(std::log2(axis_dim))); - int64_t grid_dim = n * d / axis_dim; - auto stream = ctx.stream(); - -#ifdef __HIPCC__ -#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ - case BlockDim: { \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForMax), \ - dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ - loss_data, d, axis_dim); \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForSum), \ - dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ - loss_data, softmax_data, d, axis_dim); \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForDiff), \ - dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ - loss_data, softmax_data, d, axis_dim); \ - platform::ForRange for_range(ctx, n* d); \ - if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } else { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } \ - } break -#else -#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ - case BlockDim: { \ - RowReductionForMax<<>>( \ - logits_data, loss_data, d, axis_dim); \ - RowReductionForDiffMaxSum<<>>( \ - logits_data, loss_data, softmax_data, d, axis_dim); \ - platform::ForRange for_range(ctx, n* d); \ - if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } else { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } \ - } break -#endif - - switch (block_dim) { - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); - default: - PADDLE_THROW(platform::errors::Unavailable( - "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); - break; - } -#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL -} - template static void SoftmaxWithCrossEntropyFusedKernel( const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, @@ -783,7 +946,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { const int rank = softmax->dims().size(); const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = softmax->dims()[axis]; + const int axis_dim = softmax->dims()[axis]; const int n = SizeToAxis(axis, softmax->dims()); const int d = SizeFromAxis(axis, softmax->dims()); @@ -826,9 +989,19 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { // HardLabel auto* logits_data = softmax->data(); auto* labels_data = labels->data(); - HardLabelCrossEntropy(context.cuda_device_context(), logits_data, - labels_data, loss_data, n, d, axis_dim, - ignore_index); + int threads = 128; + int blocks = (n * d / axis_dim + threads - 1) / threads; + if (ignore_index >= 0 && ignore_index < axis_dim) { + CrossEntropyHardLabel<<< 
+ blocks, threads, 0, context.cuda_device_context().stream()>>>( + loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, + ignore_index); + } else { + CrossEntropyHardLabel<<< + blocks, threads, 0, context.cuda_device_context().stream()>>>( + loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, + ignore_index); + } } // cause of input is softmax @@ -886,9 +1059,17 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { auto* logits_data = logits->data(); auto* labels_data = labels->data(); - HardLabelSoftmaxWithCrossEntropy( - context.cuda_device_context(), logits_data, labels_data, loss_data, - softmax_data, n, d, axis_dim, ignore_index); + if (ignore_index >= 0 && ignore_index < axis_dim) { + SoftmaxWithCrossEntropyHardLabel( + context.cuda_device_context(), rank, axis, logits_data, + labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, + ignore_index); + } else { + SoftmaxWithCrossEntropyHardLabel( + context.cuda_device_context(), rank, axis, logits_data, + labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, + ignore_index); + } } } } @@ -959,14 +1140,11 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { - int64_t grid = (n * remain + block - 1) / block; const int64_t* label_data = labels->data(); - CrossEntropyGrad<<>>( - logit_grad_data, label_data, n, d, remain, ignore_index); - int64_t num = n * d; - grid = (num + block - 1) / block; - Scale<<>>(logit_grad_data, loss_grad_data, num, - d, remain, label_data, ignore_index); + int grid = (n * d + block - 1) / block; + SoftmaxWithCrossEntropyGradHardLabel<<>>( + logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, + ignore_index); } } }; diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 4800f5f9eb533c047ef53755b88bf2d2f288e99c..9e5e45f4d22d919e9fd037b7d32e1408a5e092dc 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -96,9 +96,10 @@ class StackGPUKernel : public framework::OpKernel { }; template -__global__ void UnStackCUDAKernel(const T* __restrict__ input, int pre_dim_size, - int split_dim_size, int suf_dim_size, - int num_split, T** output_ptrs) { +__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, + int pre_dim_size, int split_dim_size, + int suf_dim_size, int num_split, + T** output_ptrs) { assert(blockDim.y == 1); assert(blockDim.z == 1); // In this case they are equal @@ -114,6 +115,9 @@ __global__ void UnStackCUDAKernel(const T* __restrict__ input, int pre_dim_size, IntType k = offset % suf_dim_size; T* output = output_ptrs[j / each_dim_size]; + if (output == nullptr) { + return; + } IntType output_ind = i * each_dim_size * suf_dim_size + (j % each_dim_size) * suf_dim_size + k; *(output + output_ind) = input[offset]; @@ -142,6 +146,9 @@ class StackGradGPUKernel : public framework::OpKernel { std::vector outputs(n); auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); for (size_t j = 0; j < dx.size(); ++j) { + if (dx[j] == nullptr) { + outputs[j] = nullptr; + } if (out_var_names[j] != framework::kEmptyVarName && dx[j]->numel() != 0UL) { T* ptr = dx[j]->mutable_data(ctx.GetPlace()); @@ -170,13 +177,13 @@ class StackGradGPUKernel : public framework::OpKernel { auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf); if (dy->numel() < std::numeric_limits::max()) { - UnStackCUDAKernel< + 
UnStackHelperCUDAKernel< T, int32_t><<>>( dy_data, dy_pre, split_dim, dy_suf, split_dim, reinterpret_cast(tmp_out_data->ptr())); } else { - UnStackCUDAKernel< + UnStackHelperCUDAKernel< T, int64_t><<>>( dy_data, dy_pre, split_dim, dy_suf, split_dim, diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index 684bd476b6ef21bf58a990c36b1ee6f820d82caf..9785e73a4044ebb345a442dd71ae04b42e55cad7 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -48,7 +48,7 @@ class TopkNPUKernel : public framework::OpKernel { size_t k = static_cast(ctx.Attr("k")); output->mutable_data(ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); + indices->mutable_data(ctx.GetPlace()); // prepare assit auto dim = input->dims().size(); @@ -62,15 +62,24 @@ class TopkNPUKernel : public framework::OpKernel { {"dim", -1}, {"largest", true}}; + Tensor tmp_indices(framework::proto::VarType::INT32); + tmp_indices.Resize(indices->dims()); + tmp_indices.mutable_data(ctx.GetPlace()); + // run ascend auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, - {*output, *indices}, attr_input); - + {*output, tmp_indices}, attr_input); auto stream = ctx.template device_context() .stream(); - runner.Run(stream); + + // cast indices from INT32 to INT64 + auto dst_dtype = ConvertToNpuDtype(indices->type()); + auto runner_cast_indices = + NpuOpRunner("Cast", {tmp_indices}, {*indices}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_indices.Run(stream); } }; diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 2bd2a2cbf34c6ccba1e6bfd1892f0f821d0f7c72..99793ecd244cf2594a2b0b7462a492bc3f4a27af 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -405,13 +405,13 @@ class UniqueKernel : public framework::OpKernel { bool return_counts = context.Attr("return_counts"); if (axis_vec.empty()) { - framework::VisitDataTypeSmall( + framework::VisitDataTypeTiny( data_type, UniqueFlattendTensorFunctor( context, *x, out, return_index, return_inverse, return_counts)); } else { int axis = axis_vec[0]; - framework::VisitDataTypeSmall( + framework::VisitDataTypeTiny( data_type, UniqueDimFunctor( context, *x, out, axis, return_index, return_inverse, return_counts)); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index cd8b31d72e72adba6232b703e9d2513c90e46cdf..8262273b7ca7da47dc47a2e7a02fa1f40b9d4727 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -30,6 +30,7 @@ register_unity_group(cc bmm_op.cc bpr_loss_op.cc cast_op.cc + mkldnn/cast_mkldnn_op.cc cholesky_op.cc chunk_eval_op.cc clip_by_norm_op.cc @@ -234,6 +235,7 @@ register_unity_group(cc save_combine_op.cc save_op.cc scale_op.cc + mkldnn/scale_mkldnn_op.cc scatter_nd_add_op.cc scatter_op.cc seed_op.cc diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0827d6a5ae7644579ffc2ab502893ec1e6ab1ee2..12a54fd7e87f447530888b85f15a66d1e16fdc9c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -187,10 +187,12 @@ endif() cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) +cc_test(complex_test SRCS complex_test.cc DEPS lod_tensor) IF(WITH_GPU) nv_test(float16_gpu_test SRCS float16_test.cu DEPS 
lod_tensor) nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) + nv_test(complex_gpu_test SRCS complex_test.cu DEPS lod_tensor) nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) ENDIF() diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h new file mode 100644 index 0000000000000000000000000000000000000000..2c1b42ea4882d563a5338256947339d3ab49aab4 --- /dev/null +++ b/paddle/fluid/platform/complex.h @@ -0,0 +1,537 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#include // NOLINT +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// todo +#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX +#endif + +namespace paddle { +namespace platform { + +template +struct PADDLE_ALIGN(sizeof(T) * 2) complex { + public: + T real; + T imag; + + complex() = default; + complex(const complex& o) = default; + complex& operator=(const complex& o) = default; + complex(complex&& o) = default; + complex& operator=(complex&& o) = default; + ~complex() = default; + + HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + template + HOSTDEVICE inline explicit complex(const thrust::complex& c) { + real = c.real(); + imag = c.imag(); + } + + template + HOSTDEVICE inline explicit operator thrust::complex() const { + return thrust::complex(real, imag); + } + +#ifdef PADDLE_WITH_HIP + HOSTDEVICE inline explicit operator hipFloatComplex() const { + return make_hipFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator hipDoubleComplex() const { + return make_hipDoubleComplex(real, imag); + } +#else + HOSTDEVICE inline explicit operator cuFloatComplex() const { + return make_cuFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator cuDoubleComplex() const { + return make_cuDoubleComplex(real, imag); + } +#endif +#endif + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE complex(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE inline 
explicit operator std::complex() const { + return static_cast>(std::complex(real, imag)); + } + + template + HOSTDEVICE complex(const std::complex& val) + : real(val.real()), imag(val.imag()) {} + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE inline complex& operator=(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + return *this; + } + + HOSTDEVICE inline explicit operator bool() const { + return static_cast(this->real) || static_cast(this->imag); + } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator float() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(this->real); + } +}; + +template +HOSTDEVICE inline complex operator+(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) + thrust::complex(b)); +#else + return complex(a.real + b.real, a.imag + b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) - thrust::complex(b)); +#else + return complex(a.real - b.real, a.imag - b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator*(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) * thrust::complex(b)); +#else + return complex(a.real * b.real - a.imag * b.imag, + a.imag * b.real + b.imag * a.real); +#endif +} + +template +HOSTDEVICE inline complex operator/(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) / thrust::complex(b)); +#else + T denominator = b.real * b.real + b.imag * b.imag; + return complex((a.real * b.real + a.imag * b.imag) / denominator, + (a.imag * b.real - a.real * b.imag) / denominator); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(-thrust::complex(a.real, a.imag)); +#else + complex res; + res.real = -a.real; + res.imag = -a.imag; + return res; +#endif +} + +template +HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) += + thrust::complex(b.real, b.imag)); + return a; +#else + a.real += b.real; + a.imag += 
b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) -= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real -= b.real; + a.imag -= b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) *= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real = a.real * b.real - a.imag * b.imag; + a.imag = a.imag * b.real + b.imag * a.real; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) /= + thrust::complex(b.real, b.imag)); + return a; +#else + T denominator = b.real * b.real + b.imag * b.imag; + a.real = (a.real * b.real + a.imag * b.imag) / denominator; + a.imag = (a.imag * b.real - a.real * b.imag) / denominator; + return a; +#endif +} + +template +HOSTDEVICE inline complex raw_uint16_to_complex64(uint16_t a) { + complex res; + res.real = a; + res.imag = 0.0; + return res; +} + +template +HOSTDEVICE inline bool operator==(const complex& a, const complex& b) { + return a.real == b.real && a.imag == b.imag; +} + +template +HOSTDEVICE inline bool operator!=(const complex& a, const complex& b) { + return a.real != b.real || a.imag != b.imag; +} + +template +HOSTDEVICE inline bool operator<(const complex& a, const complex& b) { + return a.real < b.real; +} + +template +HOSTDEVICE inline bool operator<=(const complex& a, const complex& b) { + return a.real <= b.real; +} + +template +HOSTDEVICE inline bool operator>(const complex& a, const complex& b) { + return a.real > b.real; +} + +template +HOSTDEVICE inline bool operator>=(const complex& a, const complex& b) { + return a.real >= b.real; +} + +template +HOSTDEVICE inline complex max(const complex& a, const complex& b) { + return (a.real >= b.real) ? a : b; +} + +template +HOSTDEVICE inline complex min(const complex& a, const complex& b) { + return (a.real < b.real) ? 
a : b; +} + +template +HOSTDEVICE inline bool(isnan)(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isnan(a.real) || ::isnan(a.imag); +#else + return std::isnan(a.real) || std::isnan(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isinf(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isinf(a.real) || ::isinf(a.imag); +#else + return std::isinf(a.real) || std::isinf(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isfinite(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isfinite(a.real) || ::isfinite(a.imag); +#else + return std::isfinite(a.real) || std::isfinite(a.imag); +#endif +} + +template +HOSTDEVICE inline T abs(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::abs(thrust::complex(a)); +#else + return std::abs(std::complex(a)); +#endif +} + +template +HOSTDEVICE inline complex pow(const complex& a, const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); +#else + return complex(std::pow(std::complex(a), std::complex(b))); +#endif +} + +template +HOSTDEVICE inline complex sqrt(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::sqrt(thrust::complex(a))); +#else + return complex(std::sqrt(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex tanh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::tanh(thrust::complex(a))); +#else + return complex(std::tanh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex log(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::log(thrust::complex(a))); +#else + return complex(std::log(std::complex(a))); +#endif +} + +template +inline std::ostream& operator<<(std::ostream& os, const complex& a) { + os << "real:" << a.real << " imag:" << a.imag; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template +struct is_pod> { + static const bool value = true; +}; + +template +struct is_floating_point> + : std::integral_constant {}; + +template +struct is_signed> { + static const bool value = false; +}; + +template +struct is_unsigned> { + static const bool value = false; +}; + +template +inline bool isnan(const paddle::platform::complex& a) { + return paddle::platform::isnan(a); +} + +template +inline bool isinf(const paddle::platform::complex& a) { + return paddle::platform::isinf(a); +} + +template +struct numeric_limits> { + static const bool is_specialized = false; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_toward_zero; + static const bool is_iec559 = false; + 
static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const int radix = 0; + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + static const bool traps = false; + static const bool tinyness_before = false; + + static paddle::platform::complex min() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex lowest() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex max() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex epsilon() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex round_error() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex infinity() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex quiet_NaN() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex signaling_NaN() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex denorm_min() { + return paddle::platform::complex(0.0, 0.0); + } +}; + +} // namespace std diff --git a/paddle/fluid/platform/complex_test.cc b/paddle/fluid/platform/complex_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d13161e94faf910829fd93543e6c18990ea7813 --- /dev/null +++ b/paddle/fluid/platform/complex_test.cc @@ -0,0 +1,324 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
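// This file exercises paddle::platform::complex<T> on the host: construction
// and conversions (from scalars, complex<T1> and std::complex), comparison
// operators, arithmetic including the compound-assignment forms, stream
// output, and the isinf/isnan overloads.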
+ +#include "paddle/fluid/platform/complex.h" +#include +#include "paddle/fluid/platform/eigen_ext.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +TEST(complex, conversion_cpu) { + // *********** complex ************* + // float to complex + EXPECT_EQ(complex().real, 0.0f); + EXPECT_EQ(complex().imag, 0.0f); + + EXPECT_EQ(complex(1.0f, 1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f, 1.0f).imag, 1.0f); + EXPECT_EQ(complex(0.0f, 1.0f).real, 0.0f); + EXPECT_EQ(complex(0.0f, 1.0f).imag, 1.0f); + + EXPECT_EQ(complex(1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f).imag, 0.0f); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0f); + EXPECT_EQ(complex(0).real, 0.0f); + EXPECT_EQ(complex(2).real, 2.0f); + EXPECT_EQ(complex(-2).real, -2.0f); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0f); + EXPECT_EQ(complex(true).imag, 0.0f); + + // complex to complex + EXPECT_EQ(complex(complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(complex(1.0, 2.0)).imag, 2.0f); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).imag, 2.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0f); + + // Assignment operator + complex c = 1.0f; + EXPECT_EQ(c.real, 1.0f); + EXPECT_EQ(c.imag, 0.0f); + c = complex(2.0, 2.0); + EXPECT_EQ(c.real, 2.0f); + EXPECT_EQ(c.imag, 2.0f); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); + + // *********** complex ************* + // double to complex + EXPECT_EQ(complex().real, 0.0); + EXPECT_EQ(complex().imag, 0.0); + + EXPECT_EQ(complex(1.0, 1.0).real, 1.0); + EXPECT_EQ(complex(1.0, 1.0).imag, 1.0); + EXPECT_EQ(complex(0.0, 1.0).real, 0.0); + EXPECT_EQ(complex(0.0, 1.0).imag, 1.0); + + EXPECT_EQ(complex(1.0).real, 1.0); + EXPECT_EQ(complex(1.0).imag, 0.0); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0); + EXPECT_EQ(complex(0).real, 0.0); + EXPECT_EQ(complex(2).real, 2.0); + EXPECT_EQ(complex(-2).real, -2.0); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0); + EXPECT_EQ(complex(true).imag, 0.0); + + // complex to complex + EXPECT_EQ(complex(complex(1.0f, 2.0f)).real, 1.0); + EXPECT_EQ(complex(complex(1.0f, 2.0f)).imag, 2.0); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + + // Assignment operator + complex c1 = 1.0; + EXPECT_EQ(c1.real, 1.0); + EXPECT_EQ(c1.imag, 0.0); + c1 = complex(2.0, 2.0); + EXPECT_EQ(c1.real, 2.0); + EXPECT_EQ(c1.imag, 2.0); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5)), 0.5); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); +} + +TEST(bfloat16, comparison_cpu) { + // *********** complex ************* + EXPECT_TRUE(complex(1.0f) == complex(1.0f)); + EXPECT_TRUE(complex(1.0f, 2.0f) == complex(1.0f, 2.0f)); + EXPECT_FALSE(complex(-1.0f) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0f) != complex(0.5f)); 
+ EXPECT_FALSE(complex(-1.0f) != complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) < complex(2.0f)); + EXPECT_FALSE(complex(-1.0f) < complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) <= complex(1.0f)); + EXPECT_TRUE(complex(2.0f) > complex(1.0f)); + EXPECT_FALSE(complex(-2.0f) > complex(-2.0f)); + EXPECT_TRUE(complex(2.0f) >= complex(2.0f)); + + // *********** complex ************* + EXPECT_TRUE(complex(1.0) == complex(1.0)); + EXPECT_TRUE(complex(1.0, 2.0) == complex(1.0, 2.0)); + EXPECT_FALSE(complex(-1.0) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0) != complex(-1.0)); + EXPECT_TRUE(complex(1.0) < complex(2.0)); + EXPECT_FALSE(complex(-1.0) < complex(-1.0)); + EXPECT_TRUE(complex(1.0) <= complex(1.0)); + EXPECT_TRUE(complex(2.0) > complex(1.0)); + EXPECT_FALSE(complex(-2.0) > complex(-2.0)); + EXPECT_TRUE(complex(2.0) >= complex(2.0)); +} + +TEST(complex, arithmetic_cpu) { + // *********** complex ************* + complex a = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a.real, 2, 0.001); + EXPECT_NEAR(a.imag, 2, 0.001); + + complex b = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b.real, 0); + EXPECT_EQ(b.imag, 0); + + complex c = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c.real, 1.0f, 0.01); + EXPECT_NEAR(c.imag, 1.0f, 0.01); + + complex d = complex(3) - complex(5); + EXPECT_EQ(d.real, -2); + EXPECT_EQ(d.imag, 0); + + complex e = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e.real, 0.33334f, 0.01); + EXPECT_NEAR(e.imag, 0.33334f, 0.01); + + complex f = complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f.real, 0.0f, 0.01); + EXPECT_NEAR(f.imag, 0.132f, 0.01); + + complex g = complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g.real, 1.65f, 0.01); + EXPECT_NEAR(g.imag, 0.0f, 0.01); + + complex h = -complex(0.33f, 0.33f); + EXPECT_NEAR(h.real, -0.33f, 0.01); + EXPECT_NEAR(h.imag, -0.33f, 0.01); + h = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h.real, 0.33f, 0.01); + EXPECT_NEAR(h.imag, 0.33f, 0.01); + + complex i = complex(1.0, 1.0); + i += complex(2.0, 2.0); + EXPECT_NEAR(i.real, 3.0f, 0.01); + EXPECT_NEAR(i.imag, 3.0f, 0.01); + i -= complex(1.0, 1.0); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + i *= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 10.0f, 0.01); + i /= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + + // *********** complex ************* + complex a1 = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a1.real, 2, 0.001); + EXPECT_NEAR(a1.imag, 2, 0.001); + + complex b1 = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b1.real, 0); + EXPECT_EQ(b1.imag, 0); + + complex c1 = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c1.real, 1.0f, 0.01); + EXPECT_NEAR(c1.imag, 1.0f, 0.01); + + complex d1 = complex(3) - complex(5); + EXPECT_EQ(d1.real, -2); + EXPECT_EQ(d1.imag, 0); + + complex e1 = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e1.real, 0.33334f, 0.01); + EXPECT_NEAR(e1.imag, 0.33334f, 0.01); + + complex f1 = + complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f1.real, 0.0f, 0.01); + EXPECT_NEAR(f1.imag, 0.132f, 0.01); + + complex g1 = + complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g1.real, 1.65f, 0.01); + EXPECT_NEAR(g1.imag, 0.0f, 0.01); + + complex h1 = -complex(0.33f, 0.33f); + EXPECT_NEAR(h1.real, -0.33f, 0.01); + EXPECT_NEAR(h1.imag, -0.33f, 0.01); + h1 = -complex(-0.33f, -0.33f); + 
EXPECT_NEAR(h1.real, 0.33f, 0.01); + EXPECT_NEAR(h1.imag, 0.33f, 0.01); + + complex i1 = complex(1.0, 1.0); + i1 += complex(2.0, 2.0); + EXPECT_NEAR(i1.real, 3.0f, 0.01); + EXPECT_NEAR(i1.imag, 3.0f, 0.01); + i1 -= complex(1.0, 1.0); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); + i1 *= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 10.0f, 0.01); + i1 /= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); +} + +TEST(complex, print) { + complex a(1.0f); + std::cout << a << std::endl; + + complex b(1.0); + std::cout << b << std::endl; +} + +TEST(complex, isinf) { + // *********** complex ************* + complex a; + a.real = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + a.imag = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + + complex b = float(INFINITY); + EXPECT_EQ(std::isinf(b), true); + + complex c(float(INFINITY), 0); + EXPECT_EQ(std::isinf(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + a1.imag = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + + complex b1 = double(INFINITY); + EXPECT_EQ(std::isinf(b1), true); + + complex c1(double(INFINITY), 0); + EXPECT_EQ(std::isinf(c1), true); +} + +TEST(complex, isnan) { + // *********** complex ************* + complex a; + a.real = float(NAN); + EXPECT_EQ(std::isnan(a), true); + a.imag = float(NAN); + EXPECT_EQ(std::isnan(a), true); + + complex b = float(NAN); + EXPECT_EQ(std::isnan(b), true); + + complex c(float(NAN), 0); + EXPECT_EQ(std::isnan(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + a1.imag = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + + complex b1 = double(NAN); + EXPECT_EQ(std::isnan(b1), true); + + complex c1(double(NAN), 0); + EXPECT_EQ(std::isnan(c1), true); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b46d1b7b271d78fd436682fa2a5ffae974e61326 --- /dev/null +++ b/paddle/fluid/platform/complex_test.cu @@ -0,0 +1,361 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
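// GPU counterpart of complex_test.cc: besides repeating the host-side
// conversion, comparison, arithmetic, print, isinf and isnan checks, it also
// verifies round-trips through thrust::complex and the cuComplex/hipComplex
// types when building with CUDA or HIP.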
+ +#include "paddle/fluid/platform/complex.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/enforce.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +namespace paddle { +namespace platform { + +TEST(complex, conversion_on_gpu) { + // *********** complex ************* + // thrust from and to complex + complex a(1.0f, 2.0f); + EXPECT_EQ(complex(thrust::complex(a)).real, 1.0); + EXPECT_EQ(complex(thrust::complex(a)).imag, 2.0); + + complex a1(1.0, 2.0); + EXPECT_EQ(complex(thrust::complex(a1)).real, 1.0); + EXPECT_EQ(complex(thrust::complex(a1)).imag, 2.0); + +#if defined(PADDLE_WITH_HIP) + EXPECT_EQ(hipFloatComplex(a).real(), 1.0); + EXPECT_EQ(hipFloatComplex(a).imag(), 2.0); + EXPECT_EQ(hipDoubleComplex(a).real(), 1.0); + EXPECT_EQ(hipDoubleComplex(a).imag(), 2.0); + + EXPECT_EQ(hipFloatComplex(a1).real(), 1.0); + EXPECT_EQ(hipFloatComplex(a1).imag(), 2.0); + EXPECT_EQ(hipDoubleComplex(a1).real(), 1.0); + EXPECT_EQ(hipDoubleComplex(a1).imag(), 2.0); +#else + EXPECT_EQ(cuCrealf(cuFloatComplex(a)), 1.0); + EXPECT_EQ(cuCimagf(cuFloatComplex(a)), 2.0); + EXPECT_EQ(cuCreal(cuDoubleComplex(a)), 1.0); + EXPECT_EQ(cuCimag(cuDoubleComplex(a)), 2.0); + + EXPECT_EQ(cuCrealf(cuFloatComplex(a1)), 1.0); + EXPECT_EQ(cuCimagf(cuFloatComplex(a1)), 2.0); + EXPECT_EQ(cuCreal(cuDoubleComplex(a1)), 1.0); + EXPECT_EQ(cuCimag(cuDoubleComplex(a1)), 2.0); +#endif + + EXPECT_EQ(complex().real, 0.0f); + EXPECT_EQ(complex().imag, 0.0f); + + EXPECT_EQ(complex(1.0f, 1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f, 1.0f).imag, 1.0f); + EXPECT_EQ(complex(0.0f, 1.0f).real, 0.0f); + EXPECT_EQ(complex(0.0f, 1.0f).imag, 1.0f); + + EXPECT_EQ(complex(1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f).imag, 0.0f); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0f); + EXPECT_EQ(complex(0).real, 0.0f); + EXPECT_EQ(complex(2).real, 2.0f); + EXPECT_EQ(complex(-2).real, -2.0f); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0f); + EXPECT_EQ(complex(true).imag, 0.0f); + + // complex to complex + EXPECT_EQ(complex(complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(complex(1.0, 2.0)).imag, 2.0f); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).imag, 2.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0f); + + // Assignment operator + complex c = 1.0f; + EXPECT_EQ(c.real, 1.0f); + EXPECT_EQ(c.imag, 0.0f); + c = complex(2.0, 2.0); + EXPECT_EQ(c.real, 2.0f); + EXPECT_EQ(c.imag, 2.0f); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); + + // *********** complex ************* + // double to complex + EXPECT_EQ(complex().real, 0.0); + EXPECT_EQ(complex().imag, 0.0); + + EXPECT_EQ(complex(1.0, 1.0).real, 1.0); + EXPECT_EQ(complex(1.0, 1.0).imag, 1.0); + EXPECT_EQ(complex(0.0, 1.0).real, 0.0); + EXPECT_EQ(complex(0.0, 1.0).imag, 1.0); + + EXPECT_EQ(complex(1.0).real, 1.0); + EXPECT_EQ(complex(1.0).imag, 0.0); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0); + EXPECT_EQ(complex(0).real, 0.0); + EXPECT_EQ(complex(2).real, 2.0); + 
EXPECT_EQ(complex(-2).real, -2.0); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0); + EXPECT_EQ(complex(true).imag, 0.0); + + // complex to complex + EXPECT_EQ(complex(complex(1.0f, 2.0f)).real, 1.0); + EXPECT_EQ(complex(complex(1.0f, 2.0f)).imag, 2.0); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + + // Assignment operator + complex c1 = 1.0; + EXPECT_EQ(c1.real, 1.0); + EXPECT_EQ(c1.imag, 0.0); + c1 = complex(2.0, 2.0); + EXPECT_EQ(c1.real, 2.0); + EXPECT_EQ(c1.imag, 2.0); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5)), 0.5); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); +} + +TEST(bfloat16, comparison_cpu) { + // *********** complex ************* + EXPECT_TRUE(complex(1.0f) == complex(1.0f)); + EXPECT_TRUE(complex(1.0f, 2.0f) == complex(1.0f, 2.0f)); + EXPECT_FALSE(complex(-1.0f) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0f) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0f) != complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) < complex(2.0f)); + EXPECT_FALSE(complex(-1.0f) < complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) <= complex(1.0f)); + EXPECT_TRUE(complex(2.0f) > complex(1.0f)); + EXPECT_FALSE(complex(-2.0f) > complex(-2.0f)); + EXPECT_TRUE(complex(2.0f) >= complex(2.0f)); + + // *********** complex ************* + EXPECT_TRUE(complex(1.0) == complex(1.0)); + EXPECT_TRUE(complex(1.0, 2.0) == complex(1.0, 2.0)); + EXPECT_FALSE(complex(-1.0) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0) != complex(-1.0)); + EXPECT_TRUE(complex(1.0) < complex(2.0)); + EXPECT_FALSE(complex(-1.0) < complex(-1.0)); + EXPECT_TRUE(complex(1.0) <= complex(1.0)); + EXPECT_TRUE(complex(2.0) > complex(1.0)); + EXPECT_FALSE(complex(-2.0) > complex(-2.0)); + EXPECT_TRUE(complex(2.0) >= complex(2.0)); +} + +TEST(complex, arithmetic_cpu) { + // *********** complex ************* + complex a = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a.real, 2, 0.001); + EXPECT_NEAR(a.imag, 2, 0.001); + + complex b = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b.real, 0); + EXPECT_EQ(b.imag, 0); + + complex c = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c.real, 1.0f, 0.01); + EXPECT_NEAR(c.imag, 1.0f, 0.01); + + complex d = complex(3) - complex(5); + EXPECT_EQ(d.real, -2); + EXPECT_EQ(d.imag, 0); + + complex e = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e.real, 0.33334f, 0.01); + EXPECT_NEAR(e.imag, 0.33334f, 0.01); + + complex f = complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f.real, 0.0f, 0.01); + EXPECT_NEAR(f.imag, 0.132f, 0.01); + + complex g = complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g.real, 1.65f, 0.01); + EXPECT_NEAR(g.imag, 0.0f, 0.01); + + complex h = -complex(0.33f, 0.33f); + EXPECT_NEAR(h.real, -0.33f, 0.01); + EXPECT_NEAR(h.imag, -0.33f, 0.01); + h = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h.real, 0.33f, 0.01); + EXPECT_NEAR(h.imag, 0.33f, 0.01); + + complex i = complex(1.0, 1.0); + i += complex(2.0, 2.0); + EXPECT_NEAR(i.real, 3.0f, 0.01); + EXPECT_NEAR(i.imag, 3.0f, 0.01); + i -= complex(1.0, 1.0); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + i *= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + 
EXPECT_NEAR(i.imag, 10.0f, 0.01); + i /= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + + // *********** complex ************* + complex a1 = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a1.real, 2, 0.001); + EXPECT_NEAR(a1.imag, 2, 0.001); + + complex b1 = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b1.real, 0); + EXPECT_EQ(b1.imag, 0); + + complex c1 = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c1.real, 1.0f, 0.01); + EXPECT_NEAR(c1.imag, 1.0f, 0.01); + + complex d1 = complex(3) - complex(5); + EXPECT_EQ(d1.real, -2); + EXPECT_EQ(d1.imag, 0); + + complex e1 = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e1.real, 0.33334f, 0.01); + EXPECT_NEAR(e1.imag, 0.33334f, 0.01); + + complex f1 = + complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f1.real, 0.0f, 0.01); + EXPECT_NEAR(f1.imag, 0.132f, 0.01); + + complex g1 = + complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g1.real, 1.65f, 0.01); + EXPECT_NEAR(g1.imag, 0.0f, 0.01); + + complex h1 = -complex(0.33f, 0.33f); + EXPECT_NEAR(h1.real, -0.33f, 0.01); + EXPECT_NEAR(h1.imag, -0.33f, 0.01); + h1 = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h1.real, 0.33f, 0.01); + EXPECT_NEAR(h1.imag, 0.33f, 0.01); + + complex i1 = complex(1.0, 1.0); + i1 += complex(2.0, 2.0); + EXPECT_NEAR(i1.real, 3.0f, 0.01); + EXPECT_NEAR(i1.imag, 3.0f, 0.01); + i1 -= complex(1.0, 1.0); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); + i1 *= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 10.0f, 0.01); + i1 /= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); +} + +TEST(complex, print) { + complex a(1.0f); + std::cout << a << std::endl; + + complex b(1.0); + std::cout << b << std::endl; +} + +TEST(complex, isinf) { + // *********** complex ************* + complex a; + a.real = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + a.imag = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + + complex b = float(INFINITY); + EXPECT_EQ(std::isinf(b), true); + + complex c(float(INFINITY), 0); + EXPECT_EQ(std::isinf(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + a1.imag = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + + complex b1 = double(INFINITY); + EXPECT_EQ(std::isinf(b1), true); + + complex c1(double(INFINITY), 0); + EXPECT_EQ(std::isinf(c1), true); +} + +TEST(complex, isnan) { + // *********** complex ************* + complex a; + a.real = float(NAN); + EXPECT_EQ(std::isnan(a), true); + a.imag = float(NAN); + EXPECT_EQ(std::isnan(a), true); + + complex b = float(NAN); + EXPECT_EQ(std::isnan(b), true); + + complex c(float(NAN), 0); + EXPECT_EQ(std::isnan(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + a1.imag = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + + complex b1 = double(NAN); + EXPECT_EQ(std::isnan(b1), true); + + complex c1(double(NAN), 0); + EXPECT_EQ(std::isnan(c1), true); +} + +} // namespace platform +} // namespace paddle +#endif \ No newline at end of file diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index dde9531e59144218c91d789a8fe668d3fffb70f2..4095720f71eb7185c474934231220b917a770375 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -16,8 +16,7 @@ limitations 
under the License. */ // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -82,28 +81,52 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, #endif } -// CUDA 9.0 have native compatible float16 shfl_down #if defined(PADDLE_WITH_HIP) template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_down(static_cast(val), static_cast(delta), width)); -#else - return float16( - __shfl_down(static_cast(val), static_cast(delta), width)); -#endif } + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + template <> __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, float16 val, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_xor(static_cast(val), width)); -#else - return float16(__shfl_xor(static_cast(val), width)); -#endif +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); } #else template <> @@ -115,25 +138,26 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex64 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { float real = static_cast(__shfl_down_sync( mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex128 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { double real = static_cast( __shfl_down_sync(mask, static_cast(val.real), static_cast(delta), width)); double imag = static_cast( __shfl_down_sync(mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex128(real, 
imag); + return paddle::platform::complex(real, imag); } template <> @@ -143,23 +167,23 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex64 val, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { float real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex128 val, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { double real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex128(real, imag); + return paddle::platform::complex(real, imag); } #endif diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index 0db4cc71b1b21085513c4703475e651b8d8edd74..4eea87e909d1b45e1f5323f774138529961961c8 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" @@ -27,6 +28,8 @@ namespace Eigen { using complex64 = paddle::platform::complex64; using complex128 = paddle::platform::complex128; using float16 = paddle::platform::float16; +template +using complex = paddle::platform::complex; template struct NumTraits; @@ -105,6 +108,50 @@ struct NumTraits : GenericNumTraits> { static inline int digits10() { return NumTraits::digits10(); } }; +template <> +struct NumTraits> : GenericNumTraits> { + typedef float Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; + +template <> +struct NumTraits> : GenericNumTraits> { + typedef double Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; + template <> struct NumTraits : GenericNumTraits { enum { @@ -354,6 +401,138 @@ HOSTDEVICE inline double abs(const complex128& a) { 
   return paddle::platform::abs(a);
 }
 
+//////////// complex<float> methods /////////////
+
+template <>
+HOSTDEVICE inline bool(isnan)(const complex<float>& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isinf)(const complex<float>& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isfinite)(const complex<float>& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+template <>
+HOSTDEVICE inline complex<float> exp(const complex<float>& a) {
+  float com = ::expf(a.real);
+  float res_real = com * ::cosf(a.imag);
+  float res_imag = com * ::sinf(a.imag);
+  return complex<float>(res_real, res_imag);
+}
+
+template <>
+HOSTDEVICE inline complex<float> log(const complex<float>& a) {
+  return paddle::platform::log(a);
+}
+
+template <>
+HOSTDEVICE inline complex<float> tanh(const complex<float>& a) {
+  return paddle::platform::tanh(a);
+}
+
+template <>
+HOSTDEVICE inline complex<float> sqrt(const complex<float>& a) {
+  return paddle::platform::sqrt(a);
+}
+
+template <>
+HOSTDEVICE inline complex<float> ceil(const complex<float>& a) {
+  return complex<float>(::ceilf(a.real), ::ceilf(a.imag));
+}
+
+template <>
+HOSTDEVICE inline complex<float> floor(const complex<float>& a) {
+  return complex<float>(::floorf(a.real), ::floor(a.imag));
+}
+
+template <>
+HOSTDEVICE inline complex<float> round(const complex<float>& a) {
+  return complex<float>(::roundf(a.real), ::roundf(a.imag));
+}
+
+template <>
+HOSTDEVICE inline complex<float> pow(const complex<float>& a,
+                                     const complex<float>& b) {
+  return paddle::platform::pow(a, b);
+}
+
+template <>
+HOSTDEVICE inline float abs(const complex<float>& a) {
+  return paddle::platform::abs(a);
+}
+
+//////////// complex<double> methods /////////////
+
+template <>
+HOSTDEVICE inline bool(isnan)(const complex<double>& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isinf)(const complex<double>& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isfinite)(const complex<double>& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+template <>
+HOSTDEVICE inline complex<double> exp(const complex<double>& a) {
+  double com = ::expf(a.real);
+  double res_real = com * ::cosf(a.imag);
+  double res_imag = com * ::sinf(a.imag);
+  return complex<double>(res_real, res_imag);
+}
+
+template <>
+HOSTDEVICE inline complex<double> log(const complex<double>& a) {
+  return paddle::platform::log(a);
+}
+
+template <>
+HOSTDEVICE inline complex<double> tanh(const complex<double>& a) {
+  return paddle::platform::tanh(a);
+}
+
+template <>
+HOSTDEVICE inline complex<double> sqrt(const complex<double>& a) {
+  return paddle::platform::sqrt(a);
+}
+
+template <>
+HOSTDEVICE inline complex<double> ceil(const complex<double>& a) {
+  return complex<double>(::ceilf(a.real), ::ceilf(a.imag));
+}
+
+template <>
+HOSTDEVICE inline complex<double> floor(const complex<double>& a) {
+  return complex<double>(::floorf(a.real), ::floor(a.imag));
+}
+
+template <>
+HOSTDEVICE inline complex<double> round(const complex<double>& a) {
+  return complex<double>(::roundf(a.real), ::roundf(a.imag));
+}
+
+template <>
+HOSTDEVICE inline complex<double> pow(const complex<double>& a,
+                                      const complex<double>& b) {
+  return paddle::platform::pow(a, b);
+}
+
+template <>
+HOSTDEVICE inline double abs(const complex<double>& a) {
+  return paddle::platform::abs(a);
+}
+
 //////////// float16 methods /////////////
 
 template <>
diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c5903d62cd277f6d86e542f04de066c96374704
--- /dev/null
+++ b/paddle/fluid/platform/fast_divmod.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include "paddle/fluid/platform/hostdevice.h"
+
+#define INT_BITS 32
+
+namespace paddle {
+namespace operators {
+
+template <typename T, int Size>
+struct alignas(sizeof(T) * Size) CudaAlignedVector {
+  T val[Size];
+};
+
+struct FastDivMod {
+  // 1st value represents the result of input number divides by recorded divisor
+  // 2nd value represents the result of input number modulo by recorded divisor
+  using DivModT = CudaAlignedVector<uint32_t, 2>;
+
+  FastDivMod() {}
+  HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) {
+    static_assert(sizeof(unsigned int) == 4,
+                  "Only Support 32-bit unsigned int.");
+
+    for (shift_val = 0; shift_val < INT_BITS; ++shift_val) {
+      auto shift_limit = 1 << shift_val;
+      if (shift_limit >= divisor) break;
+    }
+    uint64_t long_one = 1;
+    uint64_t temp_div =
+        ((long_one << INT_BITS) * ((long_one << shift_val) - divisor)) /
+            divisor +
+        1;
+    multiplier = temp_div;
+  }
+
+  __device__ __forceinline__ uint32_t Div(uint32_t n) const {
+    uint32_t t = __umulhi(n, multiplier);
+    return (t + n) >> shift_val;
+  }
+
+  __device__ __forceinline__ DivModT Divmod(uint32_t n) {
+    uint32_t q = Div(n);
+    DivModT result = {q, n - q * divisor};
+    return result;
+  }
+
+  int32_t divisor;
+  int32_t shift_val;
+  uint32_t multiplier;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index e584b849368e41ac12a604ff8924bcb620874b31..d6563be48fe484cae5c54c52c87e5c3a1493e584 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -126,13 +126,20 @@ class MKLDNNHandlerT {
     return (dev_ctx_.GetBlob(key_p) != nullptr);
   }
 
+  bool isCachedNonBlocking() {
+    const std::string key_pd = key_ + "@fwd_pd";
+    fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
+        dev_ctx_.GetBlob(key_pd));
+
+    return (fwd_pd_ != nullptr);
+  }
+
   bool isBwdCached() {
-    const std::string key_pd = key_common_ + "@bwd_pd";
+    const std::string key_pd = key_ + "@bwd_pd";
     bwd_pd_ = std::static_pointer_cast<typename TBackward::primitive_desc>(
         dev_ctx_.GetBlob(key_pd));
-    const std::string key_p = key_ + "@bwd_p";
-    return (dev_ctx_.GetBlob(key_p) != nullptr);
+    return (bwd_pd_ != nullptr);
   }
 
   // If your primitive descriptor requires attributes, pass them as a
@@ -161,6 +168,20 @@ class MKLDNNHandlerT {
     }
   }
 
+  template <typename Arg, typename... Args>
+  void AcquireForwardPrimitiveDescriptorNonBlocking(Arg&& first_arg,
+                                                    Args&&... args) {
+    // This is used when we can recreate FWD PD in BWD so
+    // we do not need to pass FWD to BWD
+    const std::string key_pd = key_ + "@fwd_pd";
+    fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
+        dev_ctx_.GetBlob(key_pd));
+    if (fwd_pd_ == nullptr) {
+      CreateForwardPrimitiveDescriptor(first_arg, std::forward<Args>(args)...);
+      dev_ctx_.SetBlob(key_pd, fwd_pd_);
+    }
+  }
+
   // Using sfinae to specialise variadic function. Workaround for not having
   // if constexpr in C++ 11.
template @@ -182,6 +203,8 @@ class MKLDNNHandlerT { std::make_shared(fwd_desc, engine_); } + // TODO(jczaja): After/if all ops can used xxxNonBlocking version + // then remove this one template void AcquireBackwardPrimitiveDescriptor(Args&&... args) { const std::string key_fwd_pd = key_common_ + "@fwd_pd"; @@ -201,6 +224,25 @@ class MKLDNNHandlerT { } } + template + void AcquireBackwardPrimitiveDescriptorNonBlocking(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptorNonBlocking + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_pd"; + bwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_pd_ == nullptr) { + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_pd_); + } + } + std::shared_ptr AcquireMemoryFromPrimitive( const std::string& suffix) { return std::static_pointer_cast( @@ -781,82 +823,6 @@ class ActivationMKLDNNHandler } }; -template -class LRNMKLDNNHandler - : public MKLDNNHandlerT { - public: - LRNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, - const platform::MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, - platform::Place cpu_place, const Tensor* input, - const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { - if (!this->isCached()) { - const int n = ctx.Attr("n"); - // MKL-DNN implements LRN in a caffe way: - // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html - // Where sum of squares is divided by size of normalization window - // this is not the case for PaddlePaddle LRN. - // Hence we need to compensate for this diffrence by - // multipliing alpha by size of window(n) - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - bool is_test = ctx.Attr("is_test"); - - auto dims = paddle::framework::vectorize(input->dims()); - - auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor( - is_test ? 
mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); - } - } - - LRNMKLDNNHandler(const std::vector& dims, const int n, - const float alpha, const float beta, const float k, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, unique_name)) { - auto src_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor( - mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, beta, - k); - } - - std::shared_ptr AcquireWorkspaceMemory( - framework::Tensor* workspace) { - T* ptr = workspace->mutable_data( - this->place_, this->fwd_pd_->workspace_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - ptr, "@wrk_mem_p"); - } - - std::shared_ptr AcquireBackwardWorkspaceMemory( - const framework::Tensor* workspace) { - const T* workspace_data = workspace->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - to_void_cast(workspace_data), - "@bwd-wrk_mem_p"); - } -}; - template class TransposeMKLDNNHandler : public MKLDNNHandler { public: @@ -960,7 +926,23 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), vtype_(vtype), - dtype_(dtype) {} + vtype_dst_(vtype), + dtype_(dtype), + dtype_dst_(dtype) {} + + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + framework::proto::VarType::Type vtype, + mkldnn::memory::data_type dtype, + framework::proto::VarType::Type vtype_dst, + mkldnn::memory::data_type dtype_dst, + const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + dims_(dims), + vtype_(vtype), + vtype_dst_(vtype_dst), + dtype_(dtype), + dtype_dst_(dtype_dst) {} std::shared_ptr AcquireSrcMemory( const MKLDNNMemoryFormat& fmt, void* ptr) { @@ -974,15 +956,16 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - auto dst_data = output->mutable_data(place, vtype_, dst_md.get_size()); + auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); + auto dst_data = + output->mutable_data(place, vtype_dst_, dst_md.get_size()); mem_p = std::make_shared(dst_md, engine_, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { // Even if memory object exists , we may be using it for diffrent tensor auto dst_data = - output->mutable_data(place, vtype_, mem_p->get_desc().get_size()); + output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); mem_p->set_data_handle(dst_data); } return mem_p; @@ -1004,8 +987,8 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; - framework::proto::VarType::Type vtype_; - mkldnn::memory::data_type dtype_; + framework::proto::VarType::Type vtype_, vtype_dst_; + mkldnn::memory::data_type dtype_, dtype_dst_; }; template diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 
91461aa26f341a91f942fc44a70064fa49ece31c..fa14ad4f63be084579951dc09eef5019f109364c 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/distributed/common/sparse_sharding_merge.h" #include "paddle/fluid/distributed/communicator_common.h" #include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" @@ -48,6 +49,7 @@ using paddle::distributed::GraphNode; using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; using paddle::distributed::FeatureNode; +using paddle::distributed::ShardingMerge; namespace paddle { namespace pybind { @@ -85,6 +87,12 @@ void BindPSHost(py::module* m) { .def("to_string", &distributed::PSHost::to_string); } +void BindSparseShardingTools(py::module* m) { + py::class_(*m, "ShardingMerge") + .def(py::init<>()) + .def("merge", &ShardingMerge::Merge); +} + void BindCommunicatorContext(py::module* m) { py::class_(*m, "CommContext") .def( diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 206a69f5a80197b15b5f579faefdad2075461c2c..4dc0f002ad3c1d9580ce8301cc74009555f552a3 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -36,5 +36,6 @@ void BindIndexNode(py::module* m); void BindTreeIndex(py::module* m); void BindIndexWrapper(py::module* m); void BindIndexSampler(py::module* m); +void BindSparseShardingTools(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 4bdf1e21bacf16e30612ff81c292dc0eacb42007..ac1fab97644a170e3644354a9036eff6f595da09 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -766,6 +766,13 @@ void BindImperative(py::module *m_ptr) { imperative::NameVarBaseMap ins = {{"Input", {self}}}; imperative::NameVarBaseMap outs = {{"Out", {self}}}; + PADDLE_ENFORCE_EQ( + self->IsLeaf() && !self->OverridedStopGradient(), false, + platform::errors::InvalidArgument( + "Leaf Tensor (%s) that doesn't stop gradient can't use " + "inplace strategy.", + self->Name())); + auto value_tensor = value_obj.cast>(); ins.insert({"ValueTensor", {value_tensor}}); @@ -1699,6 +1706,7 @@ void BindImperative(py::module *m_ptr) { m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); m.def( "dygraph_partial_grad", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 560d8c892b09f9b6f17136040455ee8469587f53..6dd08e5dfa4bf235cdab71b3d24760c4884019f2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3159,7 +3159,7 @@ All parameter, weight, gradient are variables in Paddle. BindTreeIndex(&m); BindIndexWrapper(&m); BindIndexSampler(&m); - + BindSparseShardingTools(&m); #endif } } // namespace pybind diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 416361d06a996e492118a995a6c0aa28ac38dc1a..586cbda7ccfc57bf3eb1250dbe98499e152a88b9 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -84,9 +84,9 @@ struct npy_format_descriptor { static constexpr auto name = _("bfloat16"); }; -// we register paddle::platform::complex64 as numpy.complex64. +// we register paddle::platform::complex as numpy.complex64. 
template <> -struct npy_format_descriptor { +struct npy_format_descriptor> { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX64); return reinterpret_borrow(ptr); @@ -103,9 +103,8 @@ struct npy_format_descriptor { static constexpr auto name = _("complext64"); }; -// we register paddle::platform::complex128 as numpy.complex128. template <> -struct npy_format_descriptor { +struct npy_format_descriptor> { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX128); return reinterpret_borrow(ptr); @@ -168,8 +167,8 @@ struct ValidDTypeToPyArrayChecker { DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); -DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex64); -DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex128); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -188,9 +187,9 @@ inline std::string TensorDTypeToPyDTypeStr( } else if (std::is_same::value) { \ /* NumPy character code of uint16 due to no support for bfloat16 */ \ return "H"; \ - } else if (std::is_same::value) { \ + } else if (std::is_same>::value) { \ return "F"; \ - } else if (std::is_same::value) { \ + } else if (std::is_same>::value) { \ return "D"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ @@ -367,12 +366,14 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); - } else if (py::isinstance>(array)) { - SetTensorFromPyArrayT(self, array, place, - zero_copy); - } else if (py::isinstance>(array)) { - SetTensorFromPyArrayT(self, array, place, - zero_copy); + } else if (py::isinstance>>( + array)) { + SetTensorFromPyArrayT, P>( + self, array, place, zero_copy); + } else if (py::isinstance>>( + array)) { + SetTensorFromPyArrayT, P>( + self, array, place, zero_copy); } else if (py::isinstance>(array)) { // since there is still no support for bfloat16 in NumPy, // uint16 is used for casting bfloat16 @@ -594,9 +595,9 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, case framework::proto::VarType::BF16: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::COMPLEX64: - return _sliceAndConcat(self, obj, dim); + return _sliceAndConcat>(self, obj, dim); case framework::proto::VarType::COMPLEX128: - return _sliceAndConcat(self, obj, dim); + return _sliceAndConcat>(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index e9153583f133771650cf115369dae231e6f1a3f0..2fe02dc51bf536d4132395c9c893f3fb1e9fbb74 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -44,42 +44,33 @@ build: self.requirement_build = r""" requirements: build: - - numpy>=1.12 + - numpy>=1.13 - cython - setuptools """ self.requirement_run = r""" run: - - numpy>1.12 + - requests>=2.20.0 + - numpy>=1.13 + - protobuf>=3.1.0 + - gast==0.3.3 + - Pillow - six - decorator - - nltk - - scipy - - requests - - pillow - - graphviz - - protobuf - - py-cpuinfo==5.0.0 - astor - - gast>=0.3.3 - - matplotlib """ self.requirement_run_windows = r""" run: - - numpy>=1.12 + - requests>=2.20.0 + - numpy>=1.13 + - 
protobuf>=3.1.0 + - gast==0.3.3 + - Pillow - six - decorator - - nltk - - scipy - - requests - - pillow - - graphviz - - protobuf - astor - - gast>=0.3.3 - - py-cpuinfo==5.0.0 """ self.test = r""" test: @@ -96,37 +87,20 @@ about: """ self.build_const = r""" -pip install /package/objgraph-3.4.1.tar.gz -pip install /package/rarfile-3.0.tar.gz --no-deps """ self.blt_const = r""" -pip install C:\package\objgraph-3.4.1.tar.gz -pip install C:\package\rarfile-3.0.tar.gz --no-deps -git clone https://github.com/PaddlePaddle/recordio.git -cd recordio\python -python setup.py install """ - self.python27 = r" - python>=2.7, <3.0" - self.python35 = r" - python>=3.5, <3.6" self.python36 = r" - python>=3.6, <3.7" self.python37 = r" - python>=3.7, <3.8" self.python38 = r" - python>=3.8, <3.9" + self.python39 = r" - python>=3.9, <3.10" self.python_version = [ - self.python27, self.python35, self.python36, self.python37, - self.python38 + self.python36, self.python37, self.python38, self.python39 ] - self.cuda90 = r""" - - cudatoolkit>=9.0, <9.1 - - cudnn>=7.6, <7.7 - """ - self.cuda100 = r""" - - cudatoolkit>=10.0, <10.1 - - cudnn>=7.6, <7.7 - """ self.cuda101 = r""" - cudatoolkit>=10.1, <10.2 - cudnn>=7.6, <7.7 @@ -135,30 +109,31 @@ python setup.py install - cudatoolkit>=10.2, <10.3 - cudnn>=7.6, <7.7 """ - self.cuda_info = [(self.cuda90, "cuda9.0", ".post90"), - (self.cuda100, "cuda10.0", ".post100"), - (self.cuda101, "cuda10.1", ".post101"), - (self.cuda102, "cuda10.2", "")] - self.py_str = ["py27", "py35", "py36", "py37", "py38"] + self.cuda112 = r""" + - cudatoolkit>=11.2, <11.3 + - cudnn>=8.1, <8.2 + """ + + self.cuda_info = [(self.cuda101, "cuda10.1", ".post101"), + (self.cuda102, "cuda10.2", ""), + (self.cuda112, "cuda11.2", ".post112")] + self.py_str = ["py36", "py37", "py38", "py39"] self.pip_end = ".whl --no-deps" self.pip_prefix_linux = "pip install /package/paddlepaddle" self.pip_prefix_windows = r"pip install C:\package\paddlepaddle" self.pip_gpu = "_gpu-" self.pip_cpu = "-" self.mac_pip = [ - "-cp27-cp27m-macosx_10_6_intel", "-cp35-cp35m-macosx_10_6_intel", "-cp36-cp36m-macosx_10_6_intel", "-cp37-cp37m-macosx_10_6_intel", - "-cp38-cp38-macosx_10_14_x86_64" + "-cp38-cp38-macosx_10_14_x86_64", "-cp39-cp39-macosx_10_14_x86_64" ] self.linux_pip = [ - "-cp27-cp27mu-manylinux1_x86_64", "-cp35-cp35m-manylinux1_x86_64", - "-cp36-cp36m-manylinux1_x86_64", "-cp37-cp37m-manylinux1_x86_64", - "-cp38-cp38-manylinux1_x86_64" + "-cp36-cp36m-linux_x86_64", "-cp37-cp37m-linux_x86_64", + "-cp38-cp38-linux_x86_64", "-cp39-cp39-linux_x86_64" ] self.windows_pip = [ - "-cp27-cp27m-win_amd64", "-cp35-cp35m-win_amd64", "-cp36-cp36m-win_amd64", "-cp37-cp37m-win_amd64", - "-cp38-cp38-win_amd64" + "-cp38-cp38-win_amd64", "-cp39-cp39-win_amd64" ] @@ -233,12 +208,7 @@ package: requirement = var.requirement_build + python_str + var.requirement_run_windows + python_str meta_build = var.build + build_name_str meta_str = package_str + meta_build + requirement - if (python_str == var.python27 or python_str == var.python35): - meta_str = meta_str + """ - - matplotlib<=2.2.4""" - else: - meta_str = meta_str + """ - - matplotlib""" + if not (cuda_str == None): meta_str = meta_str + cuda_str diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 76915061842d81b80eb39e12a84b2b44fb6e3583..dd8146aa3a1147b2d4e77185647be720468043f7 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -18,7 +18,7 @@ rem Paddle CI Task On Windows Platform rem 
================================================= @ECHO ON -setlocal +setlocal enabledelayedexpansion rem -------clean up environment----------- set work_dir=%cd% @@ -40,10 +40,8 @@ taskkill /f /im python.exe 2>NUL taskkill /f /im nvcc.exe 2>NUL taskkill /f /im cicc.exe 2>NUL taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL @@ -65,7 +63,7 @@ if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON -if not defined WITH_TPCACHE set WITH_TPCACHE=ON +if not defined WITH_TPCACHE set WITH_TPCACHE=OFF if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF @@ -238,6 +236,8 @@ call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary set DISTUTILS_USE_SDK=1 rem Windows 10 Kit bin dir set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% +rem Use 64-bit ToolSet to compile +set PreferredToolArchitecture=x64 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -265,12 +265,12 @@ rem ------initialize the python environment------ @ECHO ON set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% -if %WITH_PYTHON% == "ON" ( +if "%WITH_PYTHON%" == "ON" ( where python where pip pip install wheel --user pip install -r %work_dir%\python\requirements.txt --user - if %ERRORLEVEL% NEQ 0 ( + if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! exit /b 7 ) @@ -331,14 +331,14 @@ if "%WITH_GPU%"=="ON" ( ) :cmake_impl -echo cmake .. -G %GENERATOR% -T host=x64 -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +cmake .. 
-G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ @@ -368,7 +368,7 @@ echo Build third_party the %build_times% time: if %GENERATOR% == "Ninja" ( ninja third_party ) else ( - MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:quiet third_party.vcxproj + MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% third_party.vcxproj ) if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 @@ -400,22 +400,24 @@ taskkill /f /im csc.exe 2>NUL taskkill /f /im nvcc.exe 2>NUL taskkill /f /im cicc.exe 2>NUL taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="cmake.exe" call terminate 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL wmic process where name="lib.exe" call terminate 2>NUL +if "%WITH_TESTING%"=="ON" ( + for /F "tokens=1 delims= " %%# in ('tasklist ^| findstr /i test') do taskkill /f /im %%# +) + echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( - ninja -j %PARALLEL_PROJECT_COUNT% + ninja all ) else ( if "%WITH_CLCACHE%"=="OFF" ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) else ( MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) @@ -644,7 +646,7 @@ echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. 
-G %GENERATOR% -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ @@ -776,15 +778,16 @@ taskkill /f /im python.exe 2>NUL taskkill /f /im nvcc.exe 2>NUL taskkill /f /im cicc.exe 2>NUL taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL wmic process where name="lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL +if "%WITH_TESTING%"=="ON" ( + for /F "tokens=1 delims= " %%# in ('tasklist ^| findstr /i test') do taskkill /f /im %%# +) echo Windows CI run successfully! exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7d9a01106285bed530c2d13b6d79f31330afeb3d..ff3ded9f9ea56e611557323b577532a45182f911 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1417,6 +1417,175 @@ EOF fi } +function insert_pile_to_h_cu_diff { + # TODO get develop h/cu md5 + cd ${PADDLE_ROOT} + find ${PADDLE_ROOT} -name '*.h'| grep -v ${PADDLE_ROOT}/build >> ${PADDLE_ROOT}/tools/h_cu_files.log + find ${PADDLE_ROOT} -name '*.cu'| grep -v ${PADDLE_ROOT}/build >> ${PADDLE_ROOT}/tools/h_cu_files.log + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'get_h_file_md5' ${PADDLE_ROOT} + + # TODO insert pile to diff h/cu file + + #insert pile to full h/cu file + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'insert_pile_to_h_file' ${PADDLE_ROOT} +} + +function precise_card_test_single { + set +e + set +x + testcases=$1 + num=$2 + for case in $(echo $testcases | tr "$|^" "\n") + do + cd ${PADDLE_ROOT}/build + precise_card_test "^${case}$" $num + # c++ + if [ -d "${PADDLE_ROOT}/build/ut_map/$case" ];then + rm -rf ${PADDLE_ROOT}/build/ut_map/$case + fi + set -x + mkdir ${PADDLE_ROOT}/build/ut_map/$case + find paddle/fluid -name '*.gcda'|xargs -I {} cp --path {} ut_map/$case + find paddle/fluid -name '*.gcno'|xargs -I {} cp --path {} ut_map/$case + python ${PADDLE_ROOT}/tools/get_single_test_cov.py ${PADDLE_ROOT} $case & + + # python + ls python-coverage.data.* + if [[ $? 
== 0 ]] + then + mkdir -p ${PADDLE_ROOT}/build/pytest/$case + mv python-coverage.data.* ${PADDLE_ROOT}/build/pytest/$case + fi + find paddle/fluid -name *.gcda | xargs rm -f #delete gcda + done +} + +function precise_card_test() { + set -m + testcases=$1 + if (( $# > 1 )); then + cardnumber=$2 + cuda_list="0" + if [ $cardnumber -eq 2 ]; then + cuda_list=${CUDA_VISIBLE_DEVICES} + else + cuda_list="0" + fi + else + cardnumber=2 + cuda_list=${CUDA_VISIBLE_DEVICES} + fi + + if [[ "$testcases" == "" ]]; then + return 0 + fi + + echo "****************************************************************" + echo "***Running ut: $testcases***" + echo "****************************************************************" + + tmpfile=$tmp_dir/$testcases".log" + env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I 0,,1 -R "($testcases)" --timeout 500 --output-on-failure -V -j 1 > $tmpfile + set +m +} + +function get_precise_tests_map_file { + cd ${PADDLE_ROOT}/build + pip install ${PADDLE_ROOT}/build/python/dist/*whl + ut_total_startTime_s=`date +%s` + EXIT_CODE=0; + test_cases=$(ctest -N -V) # get all test cases + single_card_tests='' # all cases list which would take one graph card + exclusive_tests='' # cases list which would be run exclusively + multiple_card_tests='' # cases list which would take multiple GPUs, most cases would be two GPUs + is_exclusive='' # indicate whether the case is exclusive type + is_multicard='' # indicate whether the case is multiple GPUs type +set +x + + while read -r line; do + if [[ "$line" == "" ]]; then + continue + fi + read matchstr <<< $(echo "$line"|grep -oEi 'Test[ \t]+#') + if [[ "$matchstr" == "" ]]; then + # Any test case with LABELS property would be parse here + # RUN_TYPE=EXCLUSIVE mean the case would run exclusively + # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime + read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE") + read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST") + continue + fi + read testcase <<< $(echo "$line"|grep -oEi "\w+$") + + if [[ "$is_multicard" == "" ]]; then + # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs + read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") + fi + + if [[ "$is_exclusive" != "" ]]; then + if [[ "$exclusive_tests" == "" ]]; then + exclusive_tests="^$testcase$" + else + exclusive_tests="$exclusive_tests|^$testcase$" + fi + elif [[ "$is_multicard" != "" ]]; then + if [[ "$multiple_card_tests" == "" ]]; then + multiple_card_tests="^$testcase$" + else + multiple_card_tests="$multiple_card_tests|^$testcase$" + fi + else + if [[ "${single_card_tests}" -gt 3000 ]];then + if [[ "$single_card_tests_1" == "" ]]; then + single_card_tests_1="^$testcase$" + else + single_card_tests_1="$single_card_tests_1|^$testcase$" + fi + continue + fi + if [[ "$single_card_tests" == "" ]]; then + single_card_tests="^$testcase$" + else + single_card_tests="$single_card_tests|^$testcase$" + fi + fi + is_exclusive='' + is_multicard='' + is_nightly='' + matchstr='' + testcase='' + done <<< "$test_cases"; + +set -x + mkdir -p ${PADDLE_ROOT}/build/ut_map + mkdir -p ${PADDLE_ROOT}/build/pytest + + precise_card_test_single "$single_card_tests" 1 + precise_card_test_single "$single_card_tests_1" 1 + precise_card_test_single "$multiple_card_tests" 2 + precise_card_test_single "$exclusive_tests" + + python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_not_success_ut' ${PADDLE_ROOT} + + if [[ -f "${PADDLE_ROOT}/build/utNotSuccess" ]]; then + rerun_tests=`cat 
${PADDLE_ROOT}/build/utNotSuccess` + precise_card_test_single "$rerun_tests" + fi + wait; + + #generate python coverage and generate python file to tests_map_file + python ${PADDLE_ROOT}/tools/pyCov_multithreading.py ${PADDLE_ROOT} + + #analy h/cu to Map file + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'analy_h_cu_file' $tmp_dir ${PADDLE_ROOT} + + #generate ut map + python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT} + wait; +} + + + function parallel_test_base_xpu() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -1454,11 +1623,11 @@ set -x } function parallel_test() { - ut_total_startTime_s=`date +%s` mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install ${PADDLE_ROOT}/build/python/dist/*whl cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + ut_total_startTime_s=`date +%s` if [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then parallel_test_base_gpu else @@ -1986,6 +2155,12 @@ function main() { check_coverage check_change_of_unittest ${PYTHON_ABI:-""} ;; + ci_preciseTest) + insert_pile_to_h_cu_diff + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + enable_unused_var_check + get_precise_tests_map_file + ;; cicheck_brpc) cmake_gen ${PYTHON_ABI:-""} build ${parallel_number} diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index ba4c3b09f9ff7fff6824cec9e15679330e28734e..4f3a6f4768933d90782445edbc74f4f446a15a9b 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -25,6 +25,7 @@ from ..fluid.data_feeder import check_type from ..fluid.data_feeder import check_dtype from ..fluid.layers.tensor import fill_constant from ..fluid.layers import utils +from ..fluid.dygraph import layers from ..fluid.dygraph.parallel import prepare_context import paddle from .fleet import fleet @@ -98,6 +99,13 @@ class Group(): else: return -1 + def __repr__(self): + debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( + self.rank, self.nranks, self.id) + debug_str += ", ".join(map(str, self.ranks)) + debug_str += ". 
" + return debug_str + _global_env = None @@ -875,6 +883,86 @@ def _mp_allreduce(tensor, raise NotImplementedError("No support _mp_allreduce in dygraph mode.") +class _Linear(layers.Layer): + """ + Linear + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + super(_Linear, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.weight = self.create_parameter( + shape=[in_features, out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.bias = self.create_parameter( + shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.name = name + + def forward(self, input): + out = _linear( + x=input, weight=self.weight, bias=self.bias, name=self.name) + return out + + def extra_repr(self): + name_str = ', name={}'.format(self.name) if self.name else '' + return 'in_features={}, out_features={}, dtype={}{}'.format( + self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) + + +def _linear(x, weight, bias=None, name=None): + """ + Fuction Linear + """ + if in_dygraph_mode(): + pre_bias = _varbase_creator(dtype=x.dtype) + core.ops.matmul(x, weight, pre_bias, 'transpose_X', False, + 'transpose_Y', False, "alpha", 1) + return dygraph_utils._append_bias_in_dygraph( + pre_bias, bias, axis=len(x.shape) - 1) + else: + helper = LayerHelper('linear', **locals()) + dtype = x.dtype + assert len( + x.shape) < 4, "X latitude is not supported greater than 3 now." + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') + + inputs = {'X': [x], 'Y': [weight]} + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='matmul_v2', inputs=inputs, outputs={'Out': tmp}, attrs=attrs) + if bias is not None: + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_add', + inputs={'X': [tmp], + 'Y': [bias]}, + outputs={'Out': [res]}, + attrs={'axis': len(x.shape) - 1}) + else: + res = tmp + return res + + def _parallel_linear(x, num_rows, num_cols, @@ -889,6 +977,11 @@ def _parallel_linear(x, group=None): """ Parallel Linear + + axis the dimension of the parameter of linear layer. + axis = 0: the row dimension + axid = 1: the col dimension + """ if group is not None and not group.is_member(): return @@ -900,18 +993,32 @@ def _parallel_linear(x, else: x = _c_identity(x, group=group) - linear = paddle.nn.Linear( - num_rows, - num_cols, - weight_attr=param_attr, - bias_attr=bias_attr, - name=name) + if core.is_compiled_with_npu(): + linear = _Linear( + num_rows, + num_cols, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) + else: + linear = paddle.nn.Linear( + num_rows, + num_cols, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) linear_out = linear(x) startup_block = paddle.static.default_startup_program().global_block() main_block = paddle.static.default_main_program().global_block() startup_block.vars[linear.weight.name].is_distributed = True main_block.vars[linear.weight.name].is_distributed = True + # set is_distributed for splited bias + # if a linear layer is splited by row, each rank would hold a complete bias and they should be the same in each rank. 
+ # if a linear layer is splited by col, the bias would also be split into each rank as its weight + if axis == 1 and linear._bias_attr != False: + startup_block.vars[linear.bias.name].is_distributed = True + main_block.vars[linear.bias.name].is_distributed = True if not gather_out: return linear_out diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index 708c76ac55abe83fcb5553bd769fba4c9f579882..27437c50fad66a3438c2b6bab46e30631d2e93bd 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -74,10 +74,17 @@ def _get_ascend_rankfile(rank_table_file_path): device_count = 0 server_list = json_data['server_list'] for server in server_list: - node_ips.append(server['server_id']) device_list = server['device'] device_count = len(device_list) - + if os.getenv("FLAGS_MODELARTS", None): + nodes = os.getenv("DLS_TASK_NUMBER", None) + assert nodes is not None, "DLS_TASK_NUMBER didn't set!" + for node in range(int(nodes)): + node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None) + assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!" + node_ips.append(node_ip) + return node_ips, device_count + node_ips.append(server['server_id']) return node_ips, device_count diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py old mode 100755 new mode 100644 index 122ef4357af72619f562ce879ce2970891d961d0..0a989fe90f96a6f44659070658e2bc3c4fd8d5c9 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -814,7 +814,7 @@ class DistributedStrategy(object): "sharding_segment_strategy": "segment_broadcast_MB", "segment_broadcast_MB": 32, "sharding_degree": 8, - "sharding_degree": 2, + "dp_degree": 2, "gradient_merge_acc_step": 4, } """ @@ -949,6 +949,8 @@ class DistributedStrategy(object): **Notes**: **Detailed arguments for tensor_parallel_configs** **tensor_parallel_degree**: degree of tensor parallel + **tensor_init_seed**: parameter initialization random seed + Examples: @@ -957,7 +959,8 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.tensor_parallel = True - strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4} + strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, + "tensor_init_seed": 123} """ return get_msg_dict(self.strategy.tensor_parallel_configs) diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 15ee047b1aaee1a31fd8c854376802ce66ddad14..5e883f1ac6cc91fcc8c7ececf2df15cdf856aed6 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -17,6 +17,7 @@ import copy import warnings import paddle import os +import numpy as np from paddle.fluid.framework import dygraph_only from paddle.fluid import compiler from .role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase @@ -28,7 +29,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator from paddle.fluid.dygraph import parallel_helper from . 
import topology as tp from .topology import ParallelMode -from ..meta_parallel import ModelParallel +from ..meta_parallel import TensorParallel, model_parallel_random_seed from ..meta_parallel import PipelineParallel from ..meta_optimizers import HybridParallelOptimizer from ..meta_optimizers import HybridParallelGradScaler @@ -279,6 +280,14 @@ class Fleet(object): self._hcg = tp.HybridCommunicateGroup(self._topology) + if self.mp_degree > 1: + tensor_parallel_configs = self._user_defined_strategy.tensor_parallel_configs + tensor_init_seed = tensor_parallel_configs["tensor_init_seed"] + if tensor_init_seed == -1: + model_parallel_random_seed() + else: + model_parallel_random_seed(tensor_init_seed) + def get_hybrid_communicate_group(self): assert self._hcg is not None return self._hcg @@ -829,8 +838,8 @@ class Fleet(object): last_comm_group_size_MB, find_unused_parameters=self._user_defined_strategy. find_unused_parameters) - elif self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL: - distributed_model = ModelParallel( + elif self._hcg.get_parallel_mode() == ParallelMode.TENSOR_PARALLEL: + distributed_model = TensorParallel( model, self._hcg, strategy=self._user_defined_strategy) elif self._hcg.get_parallel_mode() == ParallelMode.PIPELINE_PARALLEL: distributed_model = PipelineParallel( diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index 470a4d83aac3fedb2135d44567fe31688894b093..04d8417fdcbf3f1ef23db09caf1cc417672b8358 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -28,7 +28,7 @@ _HYBRID_PARALLEL_GROUP = None class ParallelMode(object): DATA_PARALLEL = 0 - MODEL_PARALLEL = 1 + TENSOR_PARALLEL = 1 PIPELINE_PARALLEL = 2 @@ -155,12 +155,12 @@ class HybridCommunicateGroup(object): _HYBRID_PARALLEL_GROUP = self def get_parallel_mode(self): - # there are three modes : DataParallel / ModelParallel / PipelineParallel + # there are three modes : DataParallel / TensorParallel / PipelineParallel if self._mp_degree == 1 and self._pp_degree == 1: return ParallelMode.DATA_PARALLEL elif self._mp_degree > 1 and self._pp_degree == 1: # initialize the seed - return ParallelMode.MODEL_PARALLEL + return ParallelMode.TENSOR_PARALLEL elif self._pp_degree > 1: return ParallelMode.PIPELINE_PARALLEL @@ -253,3 +253,8 @@ class HybridCommunicateGroup(object): # check parallel group def get_check_parallel_group(self): return self._check_comm_group + + def get_rank_from_stage(self, stage_id): + coord = self._topo.get_coord(self.global_rank) + tf = coord._replace(pipe=stage_id)._asdict() + return self._topo.get_rank(**tf) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index be7ad257ccb99ccc1775c0a73cfbe8443b8454cf..c69b21538b61ad207db385afa99cbbc1448a2b71 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import functools import logging -import socket import time import os import signal @@ -27,6 +25,7 @@ from contextlib import closing import socket import warnings import six +import struct import paddle import paddle.fluid as fluid @@ -362,6 +361,10 @@ def add_arguments(argname, type, default, help, argparser, **kwargs): def find_free_ports(num): def __free_port(): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + # Note(wangxi): Close the connection with a TCP RST instead + # of a TCP FIN, to avoid time_wait state. + s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, + struct.pack('ii', 1, 0)) s.bind(('', 0)) return s.getsockname()[1] @@ -376,7 +379,7 @@ def find_free_ports(num): return port_set step += 1 - if step > 100: + if step > 400: print( "can't find avilable port and use the specified static port now!" ) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index d0e8034f5cae152f9f8f38b62ad802d1ae8dcf4f..c0f671e7e446bcb749ab9fec120b905eb570506e 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -31,7 +31,7 @@ class HybridParallelGradScaler: self._scaler = scaler self._hcg = hcg self._is_mp = ( - self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL) + self._hcg.get_parallel_mode() == ParallelMode.TENSOR_PARALLEL) def scale(self, var): return self._scaler.scale(var) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index b7ac298d2223eea23b203de7299064e200e7282e..c2d79a62c7663a01d5cd1e7ca9ac705612e1db03 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -89,13 +89,15 @@ class HybridParallelOptimizer: self._inner_opt = optimizer self._strategy = strategy self._hcg = hcg - self._is_mp = ( - self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL) + + self._use_dp_mode = ( + self._hcg.get_parallel_mode() == ParallelMode.DATA_PARALLEL) + self._need_dp = (self._hcg.get_data_parallel_world_size() > 1) if isinstance(self._inner_opt._grad_clip, - ClipGradByGlobalNorm) and self._is_mp: - logger.warning("using ClipGradByGlobalNorm in ModelParallel, the origin " \ + ClipGradByGlobalNorm) and not self._use_dp_mode: + logger.warning("using ClipGradByGlobalNorm in TensorParallel, the origin " \ "optmizer'grad clip will be changed.") self._inner_opt._grad_clip = HybridParallelClipGrad( self._inner_opt._grad_clip, hcg) @@ -103,7 +105,7 @@ class HybridParallelOptimizer: @imperative_base.no_grad @framework.dygraph_only def step(self): - if self._is_mp and self._need_dp: + if not self._use_dp_mode and self._need_dp: fused_allreduce_gradients( list(self._inner_opt._parameter_list), self._hcg) self._inner_opt.step() @@ -119,7 +121,7 @@ class HybridParallelOptimizer: parameter_list = parameters if parameters \ else self._parameter_list - if self._is_mp and self._need_dp: + if not self._use_dp_mode and self._need_dp: fused_allreduce_gradients(list(parameter_list), self._hcg) return self._inner_opt.minimize(loss, startup_program, parameters, diff --git 
a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 4194cf13d2bbcdeada650735738d7ba8e1847cd2..22ed3f2ac416032c8deba402527dc7ed381c0acf 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -63,9 +63,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase): trainer_endpoints_env = ",".join(trainer_endpoints) trainers_num = self.role_maker._worker_num() - # FIXME(wangxi): approve this. - #if trainer_id == 0: - # wait_server_ready(other_trainers) + # NOTE(wangxi): npu don't need to wait server ready + if trainer_id == 0 and not paddle.is_compiled_with_npu(): + wait_server_ready(other_trainers) if core.is_compiled_with_cuda(): comm_id_var = startup_program.global_block().create_var( diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index f4ceb2d287a56c7b955817263f751e32dbf23e77..ca3606c16e5d4740f90c78f455f7c15f307f9f8c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -402,13 +402,18 @@ def get_grad_device(grad_name, shard): return shard.global_param2device[base_name] -def get_first_check_finite_and_unscale_op_idx(block): +def get_first_check_finite_and_unscale_op_idx(block, raise_error=True): for idx, op in enumerate(block.ops): if op.type == "check_finite_and_unscale": return idx - raise ValueError("check_finite_and_unscale does not exist in block") + if raise_error: + raise ValueError( + "amp is turned on but check_finite_and_unscale op does not exist in main block" + ) + + return -1 def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 82e54a89e104ffa4e0a36ff796c0af07e04c883b..aafb15e0a01f8521854a9dd9843dfa5ad3e39e8c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -298,7 +298,7 @@ class ShardingOptimizer(MetaOptimizerBase): print("persistable FP32 grad: ") print(accumulated_grad_names) first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block) + main_block, raise_error=self.user_defined_strategy.amp) insert_reduce_ops( main_block, first_optimize_op_index, @@ -309,14 +309,15 @@ class ShardingOptimizer(MetaOptimizerBase): use_calc_stream=True) if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block) - insert_allreduce_ops( - main_block, - first_optimize_op_index, - self.dp_ring_id, - accumulated_grad_names, - core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True) + main_block, raise_error=self.user_defined_strategy.amp) + if first_optimize_op_index >= 0: + insert_allreduce_ops( + main_block, + first_optimize_op_index, + self.dp_ring_id, + accumulated_grad_names, + core.op_proto_and_checker_maker.OpRole.Optimize, + use_calc_stream=True) # if not use sharding, adapt amp/clip, for remain parallelism. 
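A small but easy-to-miss change in `launch_utils.py` above: `find_free_ports()` now sets `SO_LINGER` with a zero timeout on the probing socket, so `close()` sends a TCP RST rather than a FIN and the socket never lingers in TIME_WAIT; the retry budget is also raised from 100 to 400. A minimal standalone sketch of that probe (the helper name `probe_free_port` is mine, not the patch's):

```python
import socket
import struct
from contextlib import closing

def probe_free_port():
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        # linger on, timeout 0: close() emits RST, so the port skips TIME_WAIT
        s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
        s.bind(('', 0))            # let the OS pick an ephemeral port
        return s.getsockname()[1]

print(probe_free_port())
```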
# cast --> amp --> clip --> opt diff --git a/python/paddle/distributed/fleet/meta_parallel/__init__.py b/python/paddle/distributed/fleet/meta_parallel/__init__.py index ed74d8e744e50d0533bce09d71947107a15f2296..894771a3d5005f9b803d4a5842d266a0247c9279 100644 --- a/python/paddle/distributed/fleet/meta_parallel/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/__init__.py @@ -20,7 +20,7 @@ from .parallel_layers import PipelineLayer # noqa: F401 from .parallel_layers import RNGStatesTracker # noqa: F401 from .parallel_layers import model_parallel_random_seed # noqa: F401 from .parallel_layers import get_rng_state_tracker # noqa: F401 -from .model_parallel import ModelParallel # noqa: F401 +from .tensor_parallel import TensorParallel # noqa: F401 from .pipeline_parallel import PipelineParallel # noqa: F401 __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index af59b16e22aa858e3b1e244c0bf7169957dfa2fd..730a7430133e0658be87ed5c6d3f570400ff3ea9 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -41,6 +41,7 @@ class VocabParallelEmbedding(Layer): self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank() self.origin_num_embeddings = num_embeddings + self.is_mp = (self.world_size > 1) per_part_size = ( num_embeddings + self.world_size - 1) // self.world_size @@ -50,16 +51,36 @@ class VocabParallelEmbedding(Layer): per_part_size += 1 # make the last row as the padding index self.per_part_size = per_part_size - self.embedding = paddle.nn.Embedding( - per_part_size, - embedding_dim, - padding_idx=per_part_size - 1, - sparse=False, - weight_attr=weight_attr, - name=name) - self.embedding.weight.is_distributed = True + self._dtype = self._helper.get_default_dtype() + self._size = [per_part_size, embedding_dim] + self._weight_attr = weight_attr + self._name = name + + if self.is_mp: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + self.weight[per_part_size - 1] = 0.0 + self.weight.is_distributed = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=[num_embeddings, embedding_dim], + dtype=self._dtype, + is_bias=False) def forward(self, x): + if not self.is_mp: + return F.embedding( + x, + weight=self.weight, + padding_idx=None, + sparse=False, + name=self._name) + origin_input_shape = x.shape if len(origin_input_shape) == 2: x = paddle.unsqueeze(x, axis=-1) @@ -72,13 +93,18 @@ class VocabParallelEmbedding(Layer): if len(origin_input_shape) == 2: x_shard = paddle.squeeze(x_shard, axis=-1) - emb_out = self.embedding(x_shard) - if self.world_size > 1: - emb_out = paddle.distributed.collective._mp_allreduce( - emb_out, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) + emb_out = F.embedding( + x_shard, + weight=self.weight, + padding_idx=self.per_part_size - 1, + sparse=False, + name=self._name) + + emb_out = paddle.distributed.collective._mp_allreduce( + emb_out, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) return emb_out @@ -96,8 +122,9 @@ class ColumnParallelLinear(Layer): ) self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( ) + self._name = name + self.is_mp = (self.world_size > 1) - 
self.name = name self.gather_output = gather_output assert out_features % self.world_size == 0, ( "Number of column of the weight for linear ({}) must be" @@ -108,10 +135,20 @@ class ColumnParallelLinear(Layer): self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype) + if self.is_mp: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.weight.is_distributed = True if has_bias: @@ -119,18 +156,24 @@ class ColumnParallelLinear(Layer): self.bias = self.create_parameter( shape=[self.output_size_per_partition], attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype) + dtype=self._dtype, + is_bias=True) self.bias.is_distributed = True else: self.bias = None def forward(self, x): # use inner api to process identity - input_parallel = paddle.distributed.collective._c_identity( - x, group=self.model_parallel_group) + if self.is_mp: + input_parallel = paddle.distributed.collective._c_identity( + x, group=self.model_parallel_group) + else: + input_parallel = x + output_parallel = F.linear( - input_parallel, self.weight, self.bias, name=self.name) - if self.gather_output: + input_parallel, self.weight, self.bias, name=self._name) + + if self.gather_output and self.is_mp: output = paddle.distributed.collective._c_concat( output_parallel, nranks=self.world_size, @@ -155,7 +198,7 @@ class RowParallelLinear(Layer): self.input_is_parallel = input_is_parallel self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() - self.name = name + self._name = name self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( ) @@ -163,6 +206,7 @@ class RowParallelLinear(Layer): ) self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank() + self.is_mp = (self.world_size > 1) assert in_features % self.world_size == 0, ( "Number of row of the weight for linear ({}) must be" " divisible by model parallel size ({})".format(in_features, @@ -170,22 +214,33 @@ class RowParallelLinear(Layer): self.input_size_per_partition = in_features // self.world_size - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype) + if self.is_mp: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.weight.is_distributed = True if has_bias: self.bias = self.create_parameter( shape=[self.out_features], attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype) + dtype=self._dtype, + is_bias=True) else: self.bias = None def forward(self, x): - if self.input_is_parallel: + if self.input_is_parallel or (not self.is_mp): input_parallel = x else: # split last dim @@ -195,12 +250,16 @@ class RowParallelLinear(Layer): nranks=self.world_size, group=self.model_parallel_group) - output_parallel = 
F.linear(input_parallel, self.weight, name=self.name) - output_ = paddle.distributed.collective._mp_allreduce( - output_parallel, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) + output_parallel = F.linear(input_parallel, self.weight, name=self._name) + + if self.is_mp: + output_ = paddle.distributed.collective._mp_allreduce( + output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + else: + output_ = output_parallel output = output_ + self.bias if self.bias is not None else output_ return output diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index 41c9deabd1e11b92d0c0b522130e8fc82a05f095..70daa3b25365e4be07d38a0e848976143b3793a0 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -14,6 +14,7 @@ import paddle import contextlib +import numpy as np __all__ = [] @@ -65,14 +66,18 @@ def get_rng_state_tracker(): return RNG_STATE_TRACKER -def model_parallel_random_seed(seed=2048): +def model_parallel_random_seed(seed=None): import paddle.distributed.fleet as fleet hcg = fleet.get_hybrid_communicate_group() rank = hcg.get_model_parallel_rank() - local_seed = seed + 1024 + rank - global_seed = seed + if seed: + global_seed = seed + local_seed = seed * 1024 + rank * 100 + else: + global_seed = np.random.randint(0, 655350) + local_seed = np.random.randint(rank * 10000, (rank + 1) * 10000 - 1) RNG_STATE_TRACKER.reset() - paddle.seed(global_seed) RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed) + paddle.seed(global_seed) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 79e5bc2ffeda06d62b24aec2e10ae3ad071d856a..54324b389336d0f423a79e177a04a0381d2779b9 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -11,39 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
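The `model_parallel_random_seed()` rewrite above keeps one global seed shared by every rank (so replicated parameters still initialize identically) while registering a distinct local seed per tensor-parallel rank with the RNG state tracker (so sharded parameters differ across ranks). A minimal sketch of just the seed derivation, with the tracker and `paddle.seed` plumbing omitted; `derive_seeds` is a made-up helper name:

```python
import numpy as np

def derive_seeds(seed, rank):
    """Return (global_seed, local_seed) for one tensor-parallel rank."""
    if seed:
        # deterministic path from the patch: shared global seed, per-rank local seed
        return seed, seed * 1024 + rank * 100
    # otherwise draw the local seed from a disjoint per-rank interval
    return (np.random.randint(0, 655350),
            np.random.randint(rank * 10000, (rank + 1) * 10000 - 1))

print(derive_seeds(123, rank=0))  # (123, 125952)
print(derive_seeds(123, rank=1))  # (123, 126052)
```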
# See the License for the specific language governing permissions and -import time -import copy -import os - from types import MethodType -from numpy import prod - import paddle import paddle.fluid as fluid from .meta_parallel_base import MetaParallelBase -from .pp_utils.utils import get_tensor_bytes, is_float_tensor +from .pp_utils.utils import is_float_tensor, get_tensor_dtype, paddle_2_number, number_2_dtype from .pp_utils import utils from .parallel_layers.pp_layers import PipelineLayer from ..utils.hybrid_parallel_util import broadcast_mp_parameters from ..utils.hybrid_parallel_util import broadcast_dp_parameters -from ..utils.hybrid_parallel_util import fused_allreduce_gradients from ..utils.log_util import logger +from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer __all__ = [] -FLOAT_TYPES = [ - paddle.float16, - paddle.float32, - paddle.float64, -] - class PipelineParallel(MetaParallelBase): def __init__(self, layers, hcg, strategy): + if not isinstance(layers, PipelineLayer): + raise TypeError( + "The Layer should be a derived class of PipelineLayer.") super(PipelineParallel, self).__init__(layers, hcg, strategy) - self.use_pipe_parallel = self._hcg.get_pipe_parallel_world_size() > 1 self.use_data_parallel = self._hcg.get_data_parallel_world_size() > 1 self.use_model_parallel = self._hcg.get_model_parallel_world_size() > 1 @@ -63,8 +53,6 @@ class PipelineParallel(MetaParallelBase): self.current_loss = paddle.to_tensor(0.0) self.total_loss = None - self.use_amp = self._strategy.amp - self.init_loss_scaling = self._strategy.amp_configs['init_loss_scaling'] self.micro_batch_size = self._strategy.pipeline_configs[ 'micro_batch_size'] self.accumulate_steps = self._strategy.pipeline_configs[ @@ -75,6 +63,11 @@ class PipelineParallel(MetaParallelBase): self.prev_stage_id = self.stage_id - 1 self.next_stage_id = self.stage_id + 1 self.pp_group = self._hcg.get_pipe_parallel_group() + + self.is_first_stage = self.stage_id == 0 + self.is_last_stage = (self.stage_id == (self.num_stages - 1)) + self.global_rank = self._hcg.get_global_rank() + logger.info("Pipeline Info -- num_stages: {}, stage_id: {}".format( self.num_stages, self.stage_id)) @@ -83,51 +76,72 @@ class PipelineParallel(MetaParallelBase): broadcast_mp_parameters(self._layers, self._hcg) if self.use_data_parallel: - logger.info("start broadcast mp parameters") + logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) - def _allocate_caches(self, num_caches): + def _init_caches(self, num_caches): if self.num_caches >= num_caches: return - - num = num_caches - self.num_caches - self.num_caches = num_caches + self.num_caches = num_caches - self.num_caches for key in self.caches: - self.caches[key].extend([None] * num) + self.caches[key].extend([None] * self.num_caches) + + def _reduce_final_loss(self): + if self.is_last_stage: + assert self.total_loss is not None, "train_batch() in last stage should obtain vaild loss" + loss = self.total_loss.clone() / self.accumulate_steps + paddle.distributed.broadcast( + loss, + src=self.global_rank, + use_calc_stream=True, + group=self.pp_group) + else: + loss = paddle.to_tensor(0.0) + paddle.distributed.broadcast( + loss, + src=self._hcg.get_rank_from_stage(self.num_stages - 1), + use_calc_stream=True, + group=self.pp_group) + return loss - def train_batch(self, data, optimizer): + def train_batch(self, data, optimizer, lr_scheduler=None): + assert isinstance(optimizer, HybridParallelOptimizer), ( + 'optimizer should be 
HybridParallelOptimizer subclass.') self.optimizer = optimizer + self.lr_scheduler = lr_scheduler assert fluid.framework._dygraph_tracer()._has_grad, ( 'Please enable the generation of gradients.') - if self.stage_id == 0 or self.stage_id == self.num_stages - 1: - assert data, ( + if self.is_first_stage or self.is_last_stage: + assert data is not None, ( "For the first and the last stage, the data_iter must be set.") else: - assert data is None, ( - "For pipe stages other than the first and the last one, " - "the data_iter must be None.") + data = None + self.data = data self._layers.train() - self.total_loss = None - - minibatch_cmds = utils.TrainGenerator(self.accumulate_steps, - self.num_stages, self.stage_id) - self._train(minibatch_cmds) - return self.total_loss - def _train(self, minibatch_cmds): - self._allocate_caches(self.accumulate_steps) - for micro_cmds in minibatch_cmds: - for cmd in micro_cmds: - assert type(cmd) in self._COMMAND_MAP, "unknow cmd: {}".format( - type(cmd)) - self._apply_cmd = MethodType(self._COMMAND_MAP[type(cmd)], self) - self._apply_cmd(**cmd.kwargs) - - def _allreduce_grads(self): - if not self.use_data_parallel: return - fused_allreduce_gradients(list(self._layers.parameters()), self._hcg) + # store total loss of entire batch + self.total_loss = None + self._init_caches(self.accumulate_steps) + startup_steps = self.num_stages - self.stage_id - 1 + forward_steps = 0 + backward_steps = 0 + + # forward + while (forward_steps < self.accumulate_steps): + self._forward(cache_id=forward_steps) + forward_steps += 1 + + # backward + while (backward_steps < self.accumulate_steps): + self._backward(cache_id=backward_steps) + backward_steps += 1 + + # optimizer + self._step() + self.train_loss = self._reduce_final_loss() + return self.train_loss def _forward(self, cache_id): # load data @@ -140,16 +154,17 @@ class PipelineParallel(MetaParallelBase): else: inputs = self.caches['inputs'][cache_id] - self._clear_grads(inputs) outputs = self._layers.forward(inputs) + self._clear_grads(inputs) + self.caches['outputs'][cache_id] = outputs - if self.stage_id == self.num_stages - 1: + if self.is_last_stage: if self._layers._loss_fn is not None: labels = self.caches['labels'][cache_id] outputs = self._layers._loss_fn(outputs, labels) - if self.stage_id == self.num_stages - 1: + if self.is_last_stage: self.current_loss = outputs if isinstance(self.current_loss, paddle.Tensor): if self.total_loss is None: @@ -162,18 +177,17 @@ class PipelineParallel(MetaParallelBase): ] for idx, v in enumerate(self.current_loss): self.total_loss[idx] += v.detach() - if self.use_data_parallel: - self.current_loss = self.current_loss / self._hcg.get_data_parallel_world_size( - ) + if self.accumulate_steps > 1: self.current_loss = self.current_loss / self.accumulate_steps + self.caches['outputs'][cache_id] = self.current_loss.clone() + else: self._send_activations(cache_id) def _backward(self, cache_id): - assert self.optimizer is not None - if self.stage_id == self.num_stages - 1: + if self.is_last_stage: paddle.autograd.backward(self.caches['outputs'][cache_id]) self._send_gradients(cache_id) return @@ -194,92 +208,89 @@ class PipelineParallel(MetaParallelBase): grad_tensors = None if self.stage_id != 0: self._send_gradients(cache_id) self.caches['outputs'][cache_id] = None - #self.caches['backward_tensors'][cache_id] = None - def _get_data(self): - if self.use_model_parallel: - mp_rank = self._hcg.get_model_parallel_rank() + def _broadcast_data(self, data): + if isinstance(data, paddle.Tensor): 
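The rewritten `train_batch()` above drops the command-generator machinery in favour of a flat schedule: run the forward pass for every micro-batch, then every backward pass, then one optimizer step followed by an optional LR-scheduler step. The toy below mirrors only that control flow; `model`, `micro_batches` and `opt` are placeholders, and all stage, cache and communication handling is left out:

```python
def toy_train_batch(model, micro_batches, opt, lr_scheduler=None):
    losses = []
    for mb in micro_batches:      # forward for every micro-batch, in cache_id order
        losses.append(model(mb))
    for loss in losses:           # then the matching backward passes
        loss.backward()
    opt.step()                    # a single optimizer step per mini-batch
    opt.clear_grad()
    if lr_scheduler:
        lr_scheduler.step()
    return sum(float(l) for l in losses) / len(losses)
```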
+ paddle.distributed.broadcast( + data, + src=self._hcg.get_model_parallel_group_src_rank(), + group=self._hcg.get_model_parallel_group()) else: - mp_rank = 0 - - # mp rank 0 loads the data and broadcat it to others. - data = self.data - if self.use_model_parallel and (self.stage_id == 0 or - self.stage_id == self.num_stages - 1): - assert isinstance(data, (tuple, paddle.Tensor)) - if isinstance(data, paddle.Tensor): + for d in data: + assert isinstance(d, paddle.Tensor) paddle.distributed.broadcast( - data, + d, src=self._hcg.get_model_parallel_group_src_rank(), group=self._hcg.get_model_parallel_group()) - else: - data = [] - for d in self.data: - assert isinstance(d, paddle.Tensor) - paddle.distributed.broadcast( - d, - src=self._hcg.get_model_parallel_group_src_rank(), - group=self._hcg.get_model_parallel_group()) - data.append(d) - data = tuple(data) return data def _load_micro_batch(self, cache_id): - inputs = self._get_data() - - if self.stage_id == 0: - data = None - #if isinstance(inputs[0], paddle.Tensor): - if len(inputs) == 1: - assert isinstance(inputs[0], paddle.Tensor) - data = inputs[0].clone().detach() - #data.stop_gradient = not is_float_tensor(data) - data.stop_gradient = True + inputs = self.data + begin = cache_id * self.micro_batch_size + end = begin + self.micro_batch_size + + if self.is_first_stage: + assert len(inputs) == 2, "length of input should be 2" + if self.use_model_parallel: + inputs[0] = self._broadcast_data(inputs[0]) + if isinstance(inputs[0], tuple): + batch_size = inputs[0][0].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size, ( + "batch_size needs to be divisible by micro_batch_size. Currently, " + "batch_size = %d, micro_batch_size = %d, accumulate_steps = %d." + % + (batch_size, self.micro_batch_size, self.accumulate_steps)) + data = [ + input[begin:end, :].clone().detach() for input in inputs[0] + ] + self.caches['inputs'][cache_id] = tuple(data) + else: + batch_size = inputs[0].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size + self.caches['inputs'][cache_id] = inputs[0][begin:end, :].clone( + ).detach() + elif self.is_last_stage: + assert len(inputs) == 2, "length of input should be 2" + if self.use_model_parallel: + inputs[1] = self._broadcast_data(inputs[1]) + if isinstance(inputs[1], tuple): + batch_size = inputs[1][0].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size + data = [ + input[begin:end, :].clone().detach() for input in inputs[1] + ] + self.caches['labels'][cache_id] = tuple(data) else: - assert isinstance(inputs, tuple) - data = [] - for d in inputs: - assert isinstance(d, paddle.Tensor) - i = d.clone().detach() - #i.stop_gradient = not is_float_tensor(i) - i.stop_gradient = True - data.append(i) - data = tuple(data) - self.caches['inputs'][cache_id] = data - - if self.stage_id == self.num_stages - 1: - labels = None - #if isinstance(inputs[1], paddle.Tensor): - if len(inputs) == 1: - assert isinstance(inputs[0], paddle.Tensor) - labels = inputs[0] - elif isinstance(inputs, tuple): - labels = [] - for label in inputs: - assert isinstance(label, paddle.Tensor) - label = label.detach() - labels.append(label) - labels = tuple(labels) - self.caches['labels'][cache_id] = labels + batch_size = inputs[1].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size + self.caches['labels'][cache_id] = inputs[1][begin:end, :].clone( + ).detach() + else: + # No data input is required for other stages + inputs = None def _send_meta(self, data, 
peer): - """ - % type (0: tensor, 1: tuple) - % num_tensors if type=tuple - foreach tensor: - % ndims - % shape - """ if isinstance(data, paddle.Tensor): tensor_type = paddle.to_tensor([0]) + # send tensor type paddle.distributed.send( tensor_type, peer, use_calc_stream=True, group=self.pp_group) + + # send len(shape) dims = paddle.to_tensor(len(data.shape)) paddle.distributed.send( dims, peer, use_calc_stream=True, group=self.pp_group) + + # send shape shape = paddle.to_tensor(data.shape) paddle.distributed.send( shape, peer, use_calc_stream=True, group=self.pp_group) + + # send dtype + dtype = paddle.to_tensor(paddle_2_number(data.dtype)) + paddle.distributed.send( + dtype, peer, use_calc_stream=True, group=self.pp_group) + elif isinstance(data, tuple): tensor_type = paddle.to_tensor([1]) paddle.distributed.send( @@ -289,48 +300,73 @@ class PipelineParallel(MetaParallelBase): nums, peer, use_calc_stream=True, group=self.pp_group) for idx, d in enumerate(data): assert isinstance(d, paddle.Tensor) + # send len(shape) dims = paddle.to_tensor(len(d.shape)) paddle.distributed.send( dims, peer, use_calc_stream=True, group=self.pp_group) + + # send shape shape = paddle.to_tensor(d.shape) paddle.distributed.send( shape, peer, use_calc_stream=True, group=self.pp_group) + # send dtype + dtype = paddle.to_tensor(paddle_2_number(d.dtype)) + paddle.distributed.send( + dtype, peer, use_calc_stream=True, group=self.pp_group) + def _recv_meta(self, peer): tensor_type = paddle.to_tensor([0]) paddle.distributed.recv( tensor_type, peer, use_calc_stream=True, group=self.pp_group) - tensor_type = tensor_type.numpy()[0] + tensor_type = tensor_type.item() if tensor_type == 0: + # recv len(shape) dims = paddle.to_tensor([0]) paddle.distributed.recv( dims, peer, use_calc_stream=True, group=self.pp_group) - dims = dims.numpy()[0] + dims = dims.item() + + # recv shape shape = paddle.to_tensor([0] * dims) paddle.distributed.recv( shape, peer, use_calc_stream=True, group=self.pp_group) shape = shape.numpy().tolist() - return self._allocate_buffer( - shape, dtype="float32", num_caches=1)[0] + + # recv dtype + dtype = paddle.to_tensor([0]) + paddle.distributed.recv( + dtype, peer, use_calc_stream=True, group=self.pp_group) + return self._allocate_cache( + shape, dtype=number_2_dtype(dtype.item()), num_caches=1)[0] elif tensor_type == 1: num = paddle.to_tensor([0]) paddle.distributed.recv( num, peer, use_calc_stream=True, group=self.pp_group) - num = num.numpy()[0] + num = num.item() shapes = [] + dtypes = [] for i in range(num): + # recv len(shape) dims = paddle.to_tensor([0]) paddle.distributed.recv( dims, peer, use_calc_stream=True, group=self.pp_group) - dims = dims.numpy()[0] + + # recv shape + dims = dims.item() shape = paddle.to_tensor([0] * dims) paddle.distributed.recv( shape, peer, use_calc_stream=True, group=self.pp_group) shapes.append(shape.numpy().tolist()) - dtypes = ["float32"] * len(shapes) - caches = self._allocate_buffers(shapes, dtypes, num_caches=1)[0] + # recv dtype + dtype = paddle.to_tensor([0]) + paddle.distributed.recv( + dtype, peer, use_calc_stream=True, group=self.pp_group) + dtypes.append(number_2_dtype(dtype.item())) + + caches = self._allocate_caches(shapes, dtypes, num_caches=1)[0] caches = tuple(caches) return caches @@ -357,7 +393,6 @@ class PipelineParallel(MetaParallelBase): def _send_gradients(self, cache_id): inputs = self.caches['inputs'][cache_id] - if isinstance(inputs, paddle.Tensor): assert inputs.grad is not None paddle.distributed.send( @@ -371,7 +406,6 @@ class 
PipelineParallel(MetaParallelBase): if not is_float_tensor(d): assert d.grad is None continue - assert d.grad is not None paddle.distributed.send( d.grad, self.prev_stage_id, @@ -381,8 +415,6 @@ class PipelineParallel(MetaParallelBase): def _recv_activations(self, cache_id): inputs = None - - # Allocate the buffer if necessary if self.recv_cache is None: self.recv_cache = self._recv_meta(self.prev_stage_id) @@ -419,14 +451,16 @@ class PipelineParallel(MetaParallelBase): if self.grad_tensors is None: if isinstance(outputs, paddle.Tensor): s = list(outputs.shape) - dtype = 'float16' if self.use_amp else "float32" - self.grad_tensors = self._allocate_buffer( - s, dtype, num_buffers=1)[0] + dtype = get_tensor_dtype(outputs.dtype) + self.grad_tensors = self._allocate_cache( + s, dtype, num_caches=1)[0] else: sizes = [list(d.shape) for d in outputs if is_float_tensor(d)] - dtypes = ['float16'] * len( - sizes) if self.use_amp else ['float32'] * len(sizes) - self.grad_tensors = self._allocate_buffers( + dtypes = [ + get_tensor_dtype(d.dtype) for d in outputs + if is_float_tensor(d) + ] + self.grad_tensors = self._allocate_caches( sizes, dtypes, num_caches=1)[0] if isinstance(self.grad_tensors, paddle.Tensor): @@ -445,9 +479,10 @@ class PipelineParallel(MetaParallelBase): group=self.pp_group) def _step(self): - self._allreduce_grads() self.optimizer.step() - self.optimizer.clear_gradients() + self.optimizer.clear_grad() + if self.lr_scheduler: + self.lr_scheduler.step() def _clear_grads(self, inputs): if isinstance(inputs, paddle.Tensor): @@ -461,7 +496,7 @@ class PipelineParallel(MetaParallelBase): def _allocate_zeros(self, shape, dtype): return paddle.zeros(shape, dtype) - def _allocate_buffer(self, shape, dtype, num_caches=-1): + def _allocate_cache(self, shape, dtype, num_caches=-1): caches = [] if num_caches == -1: num_caches = self.num_caches @@ -469,7 +504,7 @@ class PipelineParallel(MetaParallelBase): caches.append(self._allocate_zeros(shape, dtype)) return caches - def _allocate_buffers(self, shapes, dtypes, num_caches=-1): + def _allocate_caches(self, shapes, dtypes, num_caches=-1): caches = [] if num_caches == -1: num_caches = self.num_caches @@ -488,11 +523,5 @@ class PipelineParallel(MetaParallelBase): state_dict = paddle.load(self.model_path) self._layers.set_state_dict(state_dict) - _COMMAND_MAP = { - utils.Optimize: _step, - utils.Forward: _forward, - utils.Backward: _backward, - } - def forward(self, *inputs, **kwargs): raise RuntimeError("Call train_batch for pipeline instead of forward.") diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index e5c5709f98d9577d742ee7eabac259459eef79b1..8c204820b16615db10968928da2c7c1867b0e6bf 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -14,20 +14,51 @@ import abc import paddle -from ...utils import hybrid_parallel_util as hp_util +from ...utils import log_util as hp_util __all__ = [] -FLOAT_TYPES = [ - paddle.float16, - paddle.float32, - paddle.float64, -] +FLOAT_TYPE_DICT = { + paddle.float16: "float16", + paddle.float32: "float32", + paddle.float64: "float64", +} + +PADDLE_TO_NUMBER = { + paddle.float16: 0, + paddle.float32: 1, + paddle.float64: 2, + paddle.int32: 3, + paddle.int64: 4 +} + +NUMBER_TO_DTYPE = { + 0: "float16", + 1: "float32", + 2: "float64", + 3: "int32", + 4: "int64" +} def is_float_tensor(tensor): """Is a float tensor""" - return 
tensor.dtype in FLOAT_TYPES + return tensor.dtype in FLOAT_TYPE_DICT.keys() + + +def get_tensor_dtype(dtype): + assert dtype in FLOAT_TYPE_DICT.keys() + return FLOAT_TYPE_DICT[dtype] + + +def paddle_2_number(dtype): + assert dtype in PADDLE_TO_NUMBER.keys() + return PADDLE_TO_NUMBER[dtype] + + +def number_2_dtype(number): + assert number in NUMBER_TO_DTYPE.keys() + return NUMBER_TO_DTYPE[number] def get_tensor_bytes(tensor): @@ -48,78 +79,3 @@ def get_tensor_bytes(tensor): else: raise ValueError("unknown data type: {}".format(tensor.dtype)) return tensor.numel() * elem_size - - -class Generator(): - def __init__(self, micro_batches, stages, stage_id): - __metaclass__ = abc.ABCMeta - - self.micro_batches = micro_batches - self.stages = stages - self.stage_id = stage_id - self.prev_stage = self.stage_id - 1 - self.next_stage = self.stage_id + 1 - - @abc.abstractmethod - def generate(self): - pass - - def __iter__(self): - self.iter = None - return self - - def __next__(self): - if self.iter is None: - self.iter = self.generate() - return next(self.iter) - - -class TrainGenerator(Generator): - def generate(self): - startup_steps = self.stages - self.stage_id - 1 - cmds = [] - forward_steps = 0 - backward_steps = 0 - #while (forward_steps < startup_steps): - # cmds.append(Forward(cache_id=forward_steps)) - # forward_steps += 1 - #while (forward_steps < self.micro_batches): - # cmds.append(Forward(cache_id=forward_steps)) - # forward_steps += 1 - # cmds.append(Backward(cache_id=backward_steps)) - # backward_steps += 1 - #while (backward_steps < self.micro_batches): - # cmds.append(Backward(cache_id=backward_steps)) - # backward_steps += 1 - #cmds.append(Optimize()) - while (forward_steps < self.micro_batches): - cmds.append(Forward(cache_id=forward_steps)) - forward_steps += 1 - while (backward_steps < self.micro_batches): - cmds.append(Backward(cache_id=backward_steps)) - backward_steps += 1 - cmds.append(Optimize()) - yield cmds - - -class Command: - def __init__(self, **kwargs): - self.name = self.__class__.__name__ - self.kwargs = kwargs - for key, val in kwargs.items(): - setattr(self, key, val) - - def __repr__(self): - return hp_util.call_to_str(self.name, **self.kwargs) - - -class Optimize(Command): - pass - - -class Forward(Command): - pass - - -class Backward(Command): - pass diff --git a/python/paddle/distributed/fleet/meta_parallel/model_parallel.py b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py similarity index 89% rename from python/paddle/distributed/fleet/meta_parallel/model_parallel.py rename to python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py index 682d7152a42bd2f2d20e759d4ef5ddb3f0edec67..1dbf668d6e13a01c29e14b6687106683af0e9d97 100644 --- a/python/paddle/distributed/fleet/meta_parallel/model_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py @@ -22,15 +22,15 @@ from ..utils.log_util import logger __all__ = [] -class ModelParallel(MetaParallelBase): +class TensorParallel(MetaParallelBase): def __init__(self, layers, hcg, **kwargs): - super(ModelParallel, self).__init__(layers, hcg, **kwargs) + super(TensorParallel, self).__init__(layers, hcg, **kwargs) def _prepare_for_model(self): logger.info("start broadcast mp parameters") broadcast_mp_parameters(self._layers, self._hcg) - logger.info("start broadcast mp parameters") + logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) logger.info("mp's parameters is ready") diff --git 
a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index d31fa549ad562370bf8669c919d6842a6562a09e..f18b82eaecd76a70bad4dec9aaf77dba34b2c158 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -847,8 +847,6 @@ class TheOnePSRuntime(RuntimeBase): dirname = os.path.normpath(dirname) pserver_id = self.role_maker._role_id() - import time - begin = time.time() for var_name in load_varnames: table_id = sparse_table_maps[var_name] path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX, @@ -856,9 +854,6 @@ class TheOnePSRuntime(RuntimeBase): meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX, "{}.block{}.meta".format(var_name, pserver_id)) self._server.load_sparse(path, meta, table_id) - end = time.time() - print("init sparse variables: {} cost time: {}".format(load_varnames, - end - begin)) def _run_server(self): if self.role_maker._is_heter_worker(): diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 5521bd5b95283744b3ced90508a37f983ba4bcf1..ddbd6111b460994f93adc90751c339db0c808234 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -44,7 +44,15 @@ def _apply_collective_grads(parameters, comm_group): for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - coalesced_grad = coalesced_grad / comm_group.nranks + div_factor = paddle.to_tensor( + comm_group.nranks, dtype=coalesced_grad.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': coalesced_grad, + 'Y': div_factor}, + outputs={'Out': coalesced_grad}, + attrs={'axis': -1}) + paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 30981f531289aefffb46f94dff9ed77c0804b253..0221a42e2a3e78685fd17af902354b7f4609df6a 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,6 +33,8 @@ from .mixed_precision import * from . import layers from .layers import * from . import optimizer +from . 
import sparsity +from .sparsity import * __all__ = [] __all__ += decoder.__all__ @@ -42,3 +45,4 @@ __all__ += extend_optimizer.__all__ __all__ += ['mixed_precision'] __all__ += layers.__all__ __all__ += optimizer.__all__ +__all__ += sparsity.__all__ diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index f940f6a3143a09fa82d4e10fba38f7d86b9c025d..2913d99ee6b217c64b5ed0fdec76ada3fbe90aa2 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -145,6 +145,7 @@ gray_list = { 'sign', 'cast', 'fused_bn_add_activation', + 'c_identity', } # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/slim/tests/README.md b/python/paddle/fluid/contrib/slim/tests/README.md index 169cb686168f8cf343dc3ee52adc5519da4fb8ab..8688c96b7bd4724540b843748d23455710918854 100644 --- a/python/paddle/fluid/contrib/slim/tests/README.md +++ b/python/paddle/fluid/contrib/slim/tests/README.md @@ -207,13 +207,29 @@ Run the following commands to download and extract Quant model: ```bash mkdir -p /PATH/TO/DOWNLOAD/MODEL/ cd /PATH/TO/DOWNLOAD/MODEL/ -export QUANT_MODEL_NAME=resnet50 -export QUANT_MODEL_ARCHIVE=${QUANT_MODEL_NAME}_quant.tar.gz -wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT2_models/${QUANT_MODEL_ARCHIVE} +export QUANT_MODEL_NAME=ResNet50 +export QUANT_MODEL_ARCHIVE=${QUANT_MODEL_NAME}_qat_model.tar.gz +wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT_models/${QUANT_MODEL_ARCHIVE} mkdir ${QUANT_MODEL_NAME} && tar -xvf ${QUANT_MODEL_ARCHIVE} -C ${QUANT_MODEL_NAME} ``` -To download other Quant models, set the `QUANT_MODEL_NAME` variable in the above commands to one of the values: `resnet101`, `mobilenetv1`, `mobilenetv2`, `vgg16`, `vgg19`. +To download other Quant models, set the `QUANT_MODEL_NAME` variable in the above commands to one of the values: `ResNet101`, `MobileNetV1`, `MobileNetV2`, `VGG16`, `VGG19`. 
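Stepping back to the `_apply_collective_grads` hunk earlier in this patch: dividing each coalesced gradient by `comm_group.nranks` through a traced `elementwise_div` keeps the scaling in place on the original buffer, and the subsequent `all_reduce` (a sum) then yields the mean gradient across ranks. A small numpy illustration of why divide-then-allreduce equals averaging (the allreduce is simulated here with a plain sum):

```python
import numpy as np

nranks = 4
per_rank_grads = [np.random.rand(3) for _ in range(nranks)]   # one gradient per rank

scaled = [g / nranks for g in per_rank_grads]                 # in-place division in the patch
allreduced = np.sum(scaled, axis=0)                           # simulated all_reduce(SUM)

assert np.allclose(allreduced, np.mean(per_rank_grads, axis=0))
```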
+ +Moreover, there are other variations of these Quant models that use different methods to obtain scales during training, run these commands to download and extract Quant model: + +```bash +mkdir -p /PATH/TO/DOWNLOAD/MODEL/ +cd /PATH/TO/DOWNLOAD/MODEL/ +export QUANT_MODEL_NAME=ResNet50_qat_perf +export QUANT_MODEL_ARCHIVE=${QUANT_MODEL_NAME}.tar.gz +wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT_models/${QUANT_MODEL_ARCHIVE} +mkdir ${QUANT_MODEL_NAME} && tar -xvf ${QUANT_MODEL_ARCHIVE} -C ${QUANT_MODEL_NAME} +``` + +To download other Quant models, set the `QUANT_MODEL_NAME` variable to on of the values: `ResNet50_qat_perf`, `ResNet50_qat_range`, `ResNet50_qat_channelwise`, `MobileNet_qat_perf`, where: +- `ResNet50_qat_perf`, `MobileNet_qat_perf` with input/output scales in `fake_quantize_moving_average_abs_max` operators, with weight scales in `fake_dequantize_max_abs` operators +- `ResNet50_qat_range`, with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, with weight scales in `fake_dequantize_max_abs` operators +- `ResNet50_qat_channelwise`, with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, with weight scales in `fake_channel_wise_dequantize_max_abs` operators Download clean FP32 model for accuracy comparison against the INT8 model: diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f78ea1b1c38b85b04ab0e09757ec4d6eea5eaf4d --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import utils +from .utils import * + +__all__ = utils.__all__ diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f1108c327407ff65596156668edd864715291894 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -0,0 +1,587 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities of Auto SParsity (ASP). 
+""" + +from __future__ import print_function + +import sys +import math +import collections +import numpy as np +from enum import Enum +from itertools import permutations +import threading + +__all__ = [ + 'density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', + 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', + 'MaskAlgo', 'CheckMethod' +] + + +class MaskAlgo(Enum): + r""" + A collection of all mask generating algorithms. + There currently are three algorithms, `MASK_1D`, `MASK_2D_GREEDY` and `MASK_2D_BEST` + """ + MASK_1D = 'get_mask_1d' + MASK_2D_GREEDY = 'get_mask_2d_greedy' + MASK_2D_BEST = 'get_mask_2d_best' + + +class CheckMethod(Enum): + r""" + A collection of all sparsity checking approaches. + There currently are two methods, `CHECK_1D` and `CHECK_2D` + """ + CHECK_1D = 'check_mask_1d' + CHECK_2D = 'check_mask_2d' + + @staticmethod + def get_checking_method(mask_algo): + r""" + Get sparsity checking method by mask generating algorithm. + + Args: + mask_algo (MaskAlgo): The algorithm of mask generating. + Returns: + CheckMethod: The corresponded sparsity checking method. + Examples: + .. code-block:: python + + import numpy as np + from paddle.fluid.contrib.sparsity import MaskAlgo, CheckMethod + + CheckMethod.get_checking_method(MaskAlgo.MASK_1D) + # CheckMethod.CHECK_1D + + CheckMethod.get_checking_method(MaskAlgo.MASK_2D_GREEDY) + # CheckMethod.CHECK_2D + + CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST) + # CheckMethod.CHECK_2D + """ + assert type(mask_algo) == MaskAlgo, \ + "mask_algo should be MaskAlgo type" + if mask_algo == MaskAlgo.MASK_1D: + return CheckMethod.CHECK_1D + else: + return CheckMethod.CHECK_2D + + +def density(x): + r""" + Return the density of the input tensor. + + Args: + x (nparray): The input tensor. + Returns: + float: The density of :attr:`x`. + Examples: + .. code-block:: python + + import numpy as np + import paddle.fluid.contrib.sparsity as sparsity + + x = np.array([[0, 1, 3, 0], + [1, 1, 0, 1]]) + sparsity.density(x) # 0.625 + """ + x_flattened = x.flatten() + return float(np.nonzero(x_flattened)[0].size) / x_flattened.size + + +def reshape_1d(mat, m): + r""" + Reshape the input matrix to shape (-1, m). + If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, + then this function would pad the remainder with 0 before reshaping. + + .. math:: + + remainder = mat.shape[1] % m + + Args: + mat (nparray): The input matrix. + m (int): The second dimension of reshaped matrix. + Returns: + tuple: A pair of the reshaped and padded matrix and the shape of padded matrix (non-reshaping). + """ + remainder = mat.shape[1] % m + if mat.shape[1] % m > 0: + mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder))) + mat_padded[:, :mat.shape[1]] = mat + shape = mat_padded.shape + return mat_padded.reshape(-1, m), shape + else: + return mat.reshape(-1, m), mat.shape + + +def check_mask_1d(mat, n, m): + r""" + Check if every row of the input matrix :attr:`mat` is in 1D `n:m` sparse pattern. + This function would pad the second dimension of :attr:`mat` by zero + to be a multiples of :attr:`m` if necessary. + + 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. + + Args: + mat (nparray): The input matrix. + n (int): n of `n:m` sparse pattern. + m (int): m of `n:m` sparse pattern. + Returns: + bool: True if every row of :attr:`mat` is in 1D n:m sparse pattern, else False. + Examples: + .. 
code-block:: python + + import numpy as np + import paddle.fluid.contrib.sparsity as sparsity + + x = np.array([[0, 1, 3, 0], + [1, 0, 0, 1]]) + sparsity.check_mask_1d(x, 2, 4) # True + + x = np.array([[0, 1, 5, 4], + [1, 0, 0, 1]]) + sparsity.check_mask_1d(x, 2, 4) # False + + # x would be padded to shape (2, 8) + x = np.array([[0, 1, 0, 4, 6], + [1, 0, 0, 1, 7]]) + sparsity.check_mask_1d(x, 2, 4) # True + """ + if len(mat.shape) <= 1: + mat_flattern, shape = reshape_1d(mat.reshape(1, mat.shape[0]), m) + else: + mat_flattern, shape = reshape_1d(mat, m) + + for sub_mat in mat_flattern: + if np.nonzero(sub_mat)[0].size > (m - n): + return False + return True + + +def get_mask_1d(mat, n, m): + r""" + Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` + in row-directory. This function would pad the second dimension of :attr:`mat` + by zero to be a multiples of :attr:`m` before mask generation. + + 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. + + Args: + mat (nparray): The input matrix. + n (int): n of `n:m` sparse pattern. + m (int): m of `n:m` sparse pattern. + Returns: + nparray: The 1D `n:m` sparse mask of :attr:`mat`. + Examples: + .. code-block:: python + + import numpy as np + import paddle.fluid.contrib.sparsity as sparsity + + mat = np.array([[0, 1, 5, 4], + [2, 7, 3, 6]]) + mask = sparsity.get_mask_1d(mat, 2, 4) + # nparray([[0, 0, 1, 1], + # [0, 1, 0, 1]]) + sparsity.check_mask_1d(mask, 2, 4) # True + """ + mat_flattern, shape = reshape_1d(mat, m) + + mask_flattern = np.ones_like(mat_flattern) + mask = np.ones_like(mat) + for i in range(mat_flattern.shape[0]): + sub_mat = mat_flattern[i] + min_order_indices = np.argsort(np.absolute(sub_mat)) + mask_flattern[i, min_order_indices[:n].tolist()] = 0 + mask_flattern = mask_flattern.reshape(shape) + mask[:, :] = mask_flattern[:, :mat.shape[1]] + return mask + + +def reshape_2d(mat, m): + r""" + Reshape the input matrix to shape (-1, :math:`m \times m`). + In each dimension of :attr:`mat`, if it is not a multiples of :attr:`m`, + then this function would pad the remainder with 0 before reshaping. + + .. math:: + + remainder_0 = mat.shape[0] % m \\ + remainder_1 = mat.shape[1] % m + + Args: + mat (nparray): The input matrix. + m (int): The square root of second dimension of reshaped matrix. + Returns: + tuple: A pair of the reshaped and padded matrix and the shape of padded matrix (non-reshaping). + """ + remainder_0 = mat.shape[0] % m + remainder_1 = mat.shape[1] % m + + new_shape = (mat.shape[0] if remainder_0 == 0 \ + else mat.shape[0] + (m - remainder_0), + mat.shape[1] if remainder_1 == 0 \ + else mat.shape[1] + (m - remainder_1)) + mat_padded = np.zeros(new_shape) + mat_padded[:mat.shape[0], :mat.shape[1]] = mat + + mat_flattern = np.empty(new_shape).reshape(-1, m * m) + curr_idx = 0 + for row_start in range(0, mat_padded.shape[0], m): + row_end = row_start + m + for col_start in range(0, mat_padded.shape[1], m): + col_end = col_start + m + sub_mat = np.squeeze(mat_padded[row_start:row_end, \ + col_start:col_end] \ + .reshape(-1)) + mat_flattern[curr_idx] = sub_mat + curr_idx += 1 + return mat_flattern, mat_padded.shape + + +def check_mask_2d(mat, n, m): + r""" + Check if every :math:`m \times m` block of the input matrix :attr:`mat` is in 2D `n:m` sparse pattern. + This function would pad each dimension of :attr:`mat` by zero to be a multiples of + :attr:`m` if necessary. 
+ + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + under the constraint of at least :attr:`n` zeros for each row and column. + + Args: + mat (nparray): The input matrix. + n (int): n of `n:m` sparse pattern. + m (int): m of `n:m` sparse pattern. + Returns: + bool: True if every :math:`m \times m` block of the input matrix :attr:`mat` is in 2D `n:m` sparse pattern, else False. + Examples: + .. code-block:: python + + import numpy as np + import paddle.fluid.contrib.sparsity as sparsity + + x = np.array([[0, 8, 9, 0], + [9, 0, 0, 10], + [5, 0, 0, 6], + [0, 4, 6, 0]]) + sparsity.check_mask_2d(x, 2, 4) # True + + x = np.array([[0, 8, 0, 9], + [9, 0, 0, 10], + [0, 5, 0, 6], + [0, 4, 6, 0]]) + sparsity.check_mask_2d(x, 2, 4) # False + + # x would be padded to shape (8, 8) + x = np.array([[0, 8, 0, 9], + [9, 0, 7, 0], + [0, 5, 0, 6], + [3, 0, 6, 0], + [1, 1, 0, 1]]) + sparsity.check_mask_2d(x, 2, 4) # True + """ + mat_padded, shape = reshape_2d(mat, m) + for sub_mat in mat_padded: + sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0 + if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \ + (np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0): + return False + return True + + +def get_mask_2d_greedy(mat, n, m): + r""" + Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. + This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. + + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + under the constraint of at least :attr:`n` zeros for each row and column. + Greedily generating: For each :math:`m \times m` block, selecting values to keep in descent order. + + Args: + mat (nparray): The input matrix. + n (int): n of `n:m` sparse pattern. + m (int): m of `n:m` sparse pattern. + Returns: + nparray: The 2D `n:m` sparse mask of :attr:`mat`. + Examples: + .. code-block:: python + + import numpy as np + import paddle.fluid.contrib.sparsity as sparsity + + mat = np.array([[9, 8, 3, 7], + [9, 2, 1, 10], + [5, 1, 3, 6], + [2, 4, 6, 1]]) + mask = sparsity.get_mask_2d_greedy(mat, 2, 4) + # nparray([[1. 1. 0. 0.] + # [1. 0. 0. 1.] + # [0. 0. 1. 1.] + # [0. 1. 1. 0.]]) + sparsity.check_mask_2d(mask, 2, 4) # True + """ + mat_padded, shape = reshape_2d(mat, m) + mask_padded = np.zeros_like(mat_padded).reshape(-1, m, m) + + for idx in range(len(mat_padded)): + sub_mat = np.absolute(np.squeeze(mat_padded[idx])) + sub_mask = np.squeeze(mask_padded[idx]) + + min_order_1d_indices = np.argsort(sub_mat) + min_order_2d_indices = [(int(x / m), x % m) + for x in min_order_1d_indices] + row_counter = collections.Counter() + col_counter = collections.Counter() + + for i in range(len(min_order_1d_indices) - 1, -1, -1): + matrix_entry = min_order_2d_indices[i] + if (row_counter[matrix_entry[0]] == n) or \ + (col_counter[matrix_entry[1]] == n): + continue + + sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0 + row_counter[matrix_entry[0]] += 1 + col_counter[matrix_entry[1]] += 1 + + mask = np.empty(shape) + curr_idx = 0 + for row_start in range(0, shape[0], m): + row_end = row_start + m + for col_start in range(0, shape[1], m): + col_end = col_start + m + mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx] + curr_idx += 1 + return mask[:mat.shape[0], :mat.shape[1]] + + +valid_2d_patterns_lock = threading.Lock() +valid_2d_patterns = {} + + +def compute_valid_2d_patterns(n, m): + r""" + Compute all vaild 2D `n:m` sparse patterns. 
+ + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + under the constraint of at least :attr:`n` zeros for each row and column. + + Args: + n (int): n of `n:m` sparse pattern. + m (int): m of `n:m` sparse pattern. + Returns: + dictionary: A dictionary with key: *m_n* (string) and value: all vaild 2D `n:m` sparse patterns. + """ + global valid_2d_patterns_lock + global valid_2d_patterns + + valid_key = '{}_{}'.format(m, n) + if valid_key in valid_2d_patterns: + return valid_2d_patterns[valid_key] + else: + patterns = np.zeros(m) + patterns[:n] = 1 + patterns = list(set(permutations(patterns.tolist()))) + patterns = patterns + patterns + patterns = np.asarray(list(set(permutations(patterns, m)))) + + valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m + ).nonzero()[0].reshape(-1) + valid_patterns = np.empty((valid.shape[0], m, m)) + valid_patterns[:] = patterns[valid[:]] + + valid_2d_patterns_lock.acquire() + valid_2d_patterns[valid_key] = valid_patterns + valid_2d_patterns_lock.release() + + return valid_patterns + + +def get_mask_2d_best(mat, n, m): + r""" + Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` + to form sparse matrix with maximun L1 norm .This function would pad each + dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. + + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + under the constraint of at least :attr:`n` zeros for each row and column. + + *Note*: L1 norm of sparse matrix from `Best` API is greater than or equal to the one from `Greedy`. + + Args: + mat (nparray): The input matrix. + n (int): n of `n:m` sparse pattern. + m (int): m of `n:m` sparse pattern. + Returns: + nparray: The 1D `n:m` sparse mask of :attr:`mat`. + Examples: + .. code-block:: python + + import numpy as np + import paddle.fluid.contrib.sparsity as sparsity + + mat = np.array([[2, 8, 9, 9], + [9, 1, 3, 9], + [5, 6, 3, 9], + [2, 4, 6, 9]]) + mask_greedy = sparsity.get_mask_2d_greedy(mat, 2, 4) + mask_greedy = sparsity.get_mask_2d_best(mat, 2, 4) + print("L1 norm of `greedy` sparse matrix", np.multiply(mat, mask_greedy).sum()) # 56 + print("L1 norm of `best` sparse matrix", np.multiply(mat, mask_best).sum()) # 61 + """ + patterns = compute_valid_2d_patterns(n, m) + + mat_flattern, shape = reshape_2d(mat, m) + mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) + pmax = np.argmax( + np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T), + axis=1) + + mask_flattern[:] = patterns[pmax[:]] + mask = np.empty(shape) + + curr_idx = 0 + for row_start in range(0, shape[0], m): + row_end = row_start + m + for col_start in range(0, shape[1], m): + col_end = col_start + m + mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx] + curr_idx += 1 + return mask[:mat.shape[0], :mat.shape[1]] + + +def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): + r""" + Create `n:m` sparse pattern mask of the input tensor via function given by :attr:`func_name`. + Currently only support tensor with dimension less than or equal to 4. + + Args: + tensor (nparray): The input tensor. + func_name (MaskAlgo, optional): The function name to generate spase mask. Default is `MaskAlgo.MASK_1D`. All options please refer to `MaskAlgo`. + n (int, optional): n of `n:m` sparse pattern. Default is 2. + m (int, optional): m of `n:m` sparse pattern. Default is 4. 
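`get_mask_2d_best()` above scores every flattened block against every valid pattern with a single `matmul` and keeps the argmax, which is what yields the maximum retained L1 norm mentioned in its docstring. A tiny numpy illustration of that scoring step on a single block of four values (the real code applies the same idea to `m x m` blocks and caches the pattern set):

```python
import numpy as np
from itertools import permutations

block = np.array([2., 8., 9., 1.])
patterns = np.asarray(sorted(set(permutations([1., 1., 0., 0.]))))  # every 2-of-4 mask
scores = patterns @ np.abs(block)        # magnitude kept by each candidate pattern
best = patterns[np.argmax(scores)]
print(best)  # [0. 1. 1. 0.] -> keeps the two largest magnitudes (8 and 9)
```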
+    Returns:
+        nparray: The `n:m` sparse mask of :attr:`tensor` generated by :attr:`func_name`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          tensor = np.array([[2, 8, 9, 9],
+                             [9, 1, 3, 9],
+                             [5, 6, 3, 9],
+                             [2, 4, 6, 9]])
+          mask_1d = sparsity.create_mask(tensor, func_name=sparsity.MaskAlgo.MASK_1D)
+          # nparray([[0 0 1 1],
+          #          [1 0 0 1],
+          #          [0 1 0 1],
+          #          [0 0 1 1]])
+          mask_2d = sparsity.create_mask(tensor, func_name=sparsity.MaskAlgo.MASK_2D_BEST)
+          # nparray([[0 1 1 0],
+          #          [1 0 0 1],
+          #          [1 1 0 0],
+          #          [0 0 1 1]])
+    """
+    shape = tensor.shape
+    dtype = tensor.dtype
+    t = tensor.astype(float)
+
+    assert type(func_name) == MaskAlgo, \
+           "func_name argument of create_mask is only accepted as type MaskAlgo. " \
+           "But got {}".format(type(func_name))
+    func = getattr(sys.modules[__name__], func_name.value, None)
+    if len(shape) == 1:
+        t = t.reshape(1, shape[0])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    elif len(shape) == 2:
+        t = t.reshape(shape[0], shape[1])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    elif len(shape) == 3:
+        t = t.reshape(shape[0] * shape[1], shape[2])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op
+    elif len(shape) == 4:
+        t = t.reshape(shape[0], shape[1] * shape[2] * shape[3])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    else:
+        raise ValueError("The dimension of input tensor is not supported in create_mask, "
+                         "only dimensions <= 4 are supported but got {}".format(len(shape)))
+
+
+def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
+    r"""
+    Check if the input tensor is in `n:m` sparse pattern via the function given by :attr:`func_name`.
+    Currently only tensors with dimension less than or equal to 4 are supported.
+
+    Args:
+        tensor (nparray): The input tensor.
+        func_name (CheckMethod, optional): The function name used to check sparsity. Default is `CheckMethod.CHECK_1D`. For all options, please refer to `CheckMethod`.
+        n (int, optional): n of `n:m` sparse pattern. Default is 2.
+        m (int, optional): m of `n:m` sparse pattern. Default is 4.
+    Returns:
+        bool: True if the tensor passes the check given by :attr:`func_name`, else False.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          tensor = np.array([[2, 8, 9, 9],
+                             [9, 1, 3, 9],
+                             [5, 6, 3, 9],
+                             [2, 4, 6, 9]])
+          mask_1d = sparsity.create_mask(tensor, func_name=sparsity.MaskAlgo.MASK_1D)
+          # nparray([[0 0 1 1],
+          #          [1 0 0 1],
+          #          [0 1 0 1],
+          #          [0 0 1 1]])
+          sparsity.check_sparsity(mask_1d, func_name=sparsity.CheckMethod.CHECK_1D) # True
+          sparsity.check_sparsity(mask_1d, func_name=sparsity.CheckMethod.CHECK_2D) # False
+    """
+    shape = tensor.shape
+    t = tensor.astype(float)
+
+    assert type(func_name) == CheckMethod, \
+           "func_name argument of check_sparsity is only accepted as type CheckMethod. 
" \ + "But got {}".format(type(func_name)) + func = getattr(sys.modules[__name__], func_name.value, None) + if len(shape) == 1: + t = t.reshape(1, shape[0]) + return func(t, n=n, m=m) + elif len(shape) == 2: + t = t.reshape(shape[0], shape[1]) + return func(t, n=n, m=m) + elif len(shape) == 3: + t = t.reshape(shape[0] * shape[1], shape[2]) + return func(t, n=n, m=m) + # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + elif len(shape) == 4: + t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + return func(t, n=n, m=m) + else: + assert True, "The dimension of input tensor is not supported in check_sparsity, " \ + "Only dimension < 4 is supported but got {}".format(len(shape)) + + return False diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index b4cd3326ddec5f75fe93090fd2ef5ce12dc45771..2b9d5128560057eaf462ed21823992c244eff960 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -252,9 +252,11 @@ class DatasetBase(object): slot_var.type = "float" elif var.dtype == core.VarDesc.VarType.INT64: slot_var.type = "uint64" + elif var.dtype == core.VarDesc.VarType.INT32: + slot_var.type = "uint32" else: raise ValueError( - "Currently, fluid.dataset only supports dtype=float32 and dtype=int64" + "Currently, fluid.dataset only supports dtype=float32, dtype=int32 and dtype=int64" ) def set_hdfs_config(self, fs_name, fs_ugi): diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index be5d9ac58311b58ba9125ee067c8b5e6edd18a95..c8e1370e44772fe208393ae38603dcfa89a5bc24 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -63,37 +63,52 @@ _functional_dygraph_context_manager = None @signature_safe_contextmanager def param_guard(parameters): + from paddle.fluid.dygraph.dygraph_to_static.program_translator import in_declarative_mode # Note: parameters is a reference of self._parameters or self._buffers - if not framework.in_dygraph_mode() and parameters: + if in_declarative_mode() and not framework.in_dygraph_mode() and parameters: origin_parameters = parameters.copy() for name, var_base in parameters.items(): - if isinstance(var_base, core.VarBase): - # Convert ParamBase into Parameter with same attributes in dy2stat. - if isinstance(var_base, framework.ParamBase): - new_var = var_base._to_static_var(to_parameter=True) - else: - # Check whether has been created before. - if var_base.name in var_base.block.vars: - new_var = var_base.block.vars[var_base.name] - # Note(Aurelius84): Convert VarBase in self._buffers into Variabe with - # same attributes and set persistable=True to allow saving this var. - # Because users can create a VarBase in `__init__` like a - # `mask` Tensor or `hidden_0` in RNN layers, which is equivalent to a Parameter - # and necessary for inferring. It will be pruned if it's not necessary for inferring. - else: - # But if its shape is empty while created from `create_variable()`, we consider this buffer - # non-persistable. See case of `drop_state` in lstm api. - is_persistable = len(var_base.shape) > 0 - - new_var = var_base._to_static_var( - to_parameter=False, persistable=is_persistable) - parameters[name] = new_var + if isinstance(var_base, list): + new_var = [_convert_into_variable(var) for var in var_base] + else: + new_var = _convert_into_variable(var_base) + parameters[name] = new_var yield parameters.update(origin_parameters) else: yield +def _convert_into_variable(var_base): + """ + Convert Varbase into Variable. 
+ """ + if isinstance(var_base, core.VarBase): + # Check whether has been created before. + new_var = var_base.block._find_var_recursive(var_base.name) + if new_var is not None: + assert isinstance(new_var, framework.Variable) + # Convert ParamBase into Parameter with same attributes in dy2stat. + elif isinstance(var_base, framework.ParamBase): + new_var = var_base._to_static_var(to_parameter=True) + else: + # Note(Aurelius84): Convert VarBase in self._buffers into Variable with + # same attributes and set persistable=True to allow saving this var. + # Because users can create a VarBase in `__init__` like a + # `mask` Tensor or `hidden_0` in RNN layers, which is equivalent to a Parameter + # and necessary for inferring. It will be pruned if it's not necessary for inferring. + + # But if its shape is empty while created from `create_variable()`, we consider this buffer + # non-persistable. See case of `drop_state` in lstm api. + is_persistable = len(var_base.shape) > 0 + + new_var = var_base._to_static_var( + to_parameter=False, persistable=is_persistable) + return new_var + else: + return var_base + + def enabled(): """ This function checks whether the program runs in dynamic graph mode or not. @@ -664,7 +679,7 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): if isinstance(framework._current_expected_place(), framework.core.CPUPlace): #TODO(zhiqiu): we found two problems when enable zero_copy on CPUPlace. - # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. + # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html # (2): when used in flask framework, it may result in hang. # Details: https://github.com/PaddlePaddle/Paddle/issues/26635 diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index c7ea412fec1b77cf0dd86c187250e8ac499a800b..2938516e5bc442c6936d79ee7ccd9b66440afa15 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -15,6 +15,7 @@ from collections import OrderedDict from ..framework import Parameter from .layers import Layer +from .base import param_guard __all__ = [ 'Sequential', @@ -159,7 +160,8 @@ class ParameterList(Layer): self.add_parameter(str(idx), param) def __getitem__(self, idx): - return self._parameters[str(idx)] + with param_guard(self._parameters): + return self._parameters[str(idx)] def __setitem__(self, idx, param): assert isinstance(param, Parameter) @@ -169,7 +171,8 @@ class ParameterList(Layer): return len(self._parameters) def __iter__(self): - return iter(self._parameters.values()) + with param_guard(self._parameters): + return iter(self._parameters.values()) def append(self, parameter): """Appends a given parameter at the end of the list. 
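Editor's sketch (not part of the patch): the container.py change above is what makes a ParameterList usable inside a to_static forward, because indexing or iterating it now goes through param_guard, which converts each ParamBase into a static Parameter during dy2static tracing. The hypothetical layer below only illustrates the intended usage and mirrors the new test_param_guard.py unit test added later in this patch.

import paddle
from paddle.jit import to_static


class TinyNet(paddle.nn.Layer):
    def __init__(self, in_size, out_size):
        super(TinyNet, self).__init__()
        weight = self.create_parameter([in_size, out_size])
        bias = self.create_parameter([out_size], is_bias=True)
        # ParameterList.__getitem__ / __iter__ are now wrapped with param_guard.
        self.params = paddle.nn.ParameterList([weight, bias])

    @to_static
    def forward(self, x):
        # self.params[0] works both in dygraph and after dy2static conversion.
        out = paddle.matmul(x, self.params[0])
        out = paddle.add(out, self.params[1])
        return paddle.tanh(out)


net = TinyNet(10, 3)
out = net(paddle.rand([4, 10], dtype='float32'))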
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 7604be2d838eb669c0f1af1f3a4c53716ce2562f..a621f68c6545a596541771f3429dfa6a165ddf1b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -26,6 +26,7 @@ import types import numpy import six +from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction @@ -40,6 +41,9 @@ __all__ = ["convert_call"] BUILTIN_LIKELY_MODULES = [ collections, pdb, copy, inspect, re, six, numpy, logging ] +# The api(s) should be considered as plain function and convert +# them into static layer code. +PADDLE_NEED_CONVERT_APIS = [Sequential] translator_logger = TranslatorLogger() @@ -92,6 +96,10 @@ def is_unsupported(func): format(func)) return True + # NOTE: should be placed before `is_paddle_func` + if type(func) in PADDLE_NEED_CONVERT_APIS: + return False + if is_paddle_func(func): translator_logger.log( 2, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py index 4b3b9fcf298855ae09856636e3e7af40ae8ae6da..cbe6b8a0ff942846d9e751a3daa2e02e4eefbb6b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py @@ -368,5 +368,8 @@ class StaticAnalysisVisitor(object): if isinstance(node.func, gast.Name): return self.var_env.get_var_type(node.func.id) + if isinstance(node, gast.Subscript): + if self.is_tensor_node(node.value): + return {NodeVarType.TENSOR} return {NodeVarType.STATEMENT} diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 1bde7ef8ab099c87e48988568318134a08f04ec8..ecf6be1a0224af6b89033d6e279e3c2cfe3ef192 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -873,6 +873,10 @@ class Layer(core.Layer): pass def __call__(self, *inputs, **kwargs): + # NOTE(Aurelius84): Why we still need param_guard here? + # In case of ControlFlow, true_fn and false_fn will contain + # parameters that may not trigger logic of `Operator` to create + # them. we add this to make sure all parameters is available. with param_guard(self._parameters), param_guard(self._buffers): for forward_pre_hook in self._forward_pre_hooks.values(): hook_result = forward_pre_hook(self, inputs) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 37900b7880a359be47ff7de1279587ca6a096cb4..644e25ab9183b870365736df67ad8acab0a0bad4 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -86,7 +86,7 @@ def monkey_patch_varbase(): """ - # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. + # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery in dygraph only, should not let it getattr(self, attr, None). 
attr_not_need_keys = ['grad'] if isinstance(self, ParamBase): @@ -108,6 +108,8 @@ def monkey_patch_varbase(): if to_parameter or isinstance(self, ParamBase): del attr_kwargs['persistable'] + # NOTE(Aurelius84): All parameters should be placed into global block. + attr_kwargs['block'] = attr_kwargs['block'].program.global_block() static_var = Parameter(**attr_kwargs) else: static_var = Variable(**attr_kwargs) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index f4cad7894a31b6998aacc3d2556848cb61c29821..a858ba783428e3d1c8870c87e51e474806c17042 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -39,6 +39,7 @@ from . import unique_name import paddle.version as fluid_version import warnings import functools +from .variable_index import _getitem_impl_, _setitem_impl_ __all__ = [ 'Program', @@ -778,205 +779,6 @@ class ParameterMetaClass(VariableMetaClass): return issubclass(t, Parameter) -def _getitem_impl_(var, item): - """ - Slice the variable. - - Args: - item(int/slice/tuple) : the index. - - Returns: - Sliced variable - """ - - if not isinstance(item, tuple): - item = [item] - - decrease_axis = [] - slice_axis = [] - slice_start = [] - slice_end = [] - slice_step = [] - use_strided_slice = False - reverse_axis = [] - target_block = default_main_program().current_block() - - def fill_constant(shape, value, force_cpu=False, out=None): - var.block.append_op( - type='fill_constant', - inputs={}, - outputs={'Out': [out]}, - attrs={ - 'shape': shape, - 'dtype': out.dtype, - 'value': float(value), - 'force_cpu': force_cpu - }) - out.stop_gradient = True - return out - - for dim, slice_item in enumerate(item): - if isinstance(slice_item, slice): - start = slice_item.start - end = slice_item.stop - step = slice_item.step - - if start is None and end is None and step is None: - continue - - if step is None: - step = 1 - - if start is None and end is None: - assert (step == -1) - reverse_axis.append(dim) - continue - - if start is None: - start = 0 - - if end is None: - end = 10000000 - - if step != 1: - use_strided_slice = True - - slice_axis.append(dim) - slice_start.append(start) - slice_end.append(end) - slice_step.append(step) - else: - decrease_axis.append(dim) - slice_axis.append(dim) - slice_start.append(slice_item) - slice_step.append(1) - if isinstance(slice_item, Variable): - temp_1 = var.block.create_var(dtype=slice_item.dtype) - fill_constant([1], 1, force_cpu=True, out=temp_1) - temp_end = target_block.create_var(dtype=slice_item.dtype) - target_block.append_op( - type='elementwise_add', - inputs={'X': slice_item, - 'Y': temp_1}, - outputs={'Out': temp_end}, - attrs={'axis': -1}) - slice_end.append(temp_end) - else: - slice_end.append(slice_item + 1 - if slice_item != -1 else 10000000) - - def contain_var(one_list): - for ele in one_list: - if isinstance(ele, Variable): - return True - return False - - def get_new_list_tensor(old_list): - new_list_tensor = [] - for dim in old_list: - if isinstance(dim, Variable): - dim.stop_gradient = True - new_list_tensor.append(dim) - else: - assert (isinstance(dim, int)) - temp_out = var.block.create_var(dtype='int64') - fill_constant([1], dim, force_cpu=True, out=temp_out) - new_list_tensor.append(temp_out) - return new_list_tensor - - inputs = {'Input': [var]} - attrs = { - 'axes': slice_axis, - 'starts': [], - 'ends': [], - 'decrease_axis': decrease_axis - } - if (use_strided_slice == True): - attrs['strides'] = [] - infer_flags = list(1 for i in range(len(slice_axis))) - - # 
starts - if contain_var(slice_start): - inputs['StartsTensorList'] = get_new_list_tensor(slice_start) - for i, dim in enumerate(slice_start): - if isinstance(dim, Variable): - attrs['starts'].append(-1) - infer_flags[i] = -1 - else: - attrs['starts'].append(dim) - else: - attrs['starts'] = slice_start - - # ends - if contain_var(slice_end): - inputs['EndsTensorList'] = get_new_list_tensor(slice_end) - for i, dim in enumerate(slice_end): - if isinstance(dim, Variable): - attrs['ends'].append(-1) - infer_flags[i] = -1 - else: - attrs['ends'].append(dim) - else: - attrs['ends'] = slice_end - - # strides - if use_strided_slice == True: - if contain_var(slice_step): - inputs['StridesTensorList'] = get_new_list_tensor(slice_step) - for i, dim in enumerate(slice_step): - if isinstance(dim, Variable): - attrs['strides'].append(-1) - infer_flags[i] = -1 - else: - attrs['strides'].append(dim) - else: - attrs['strides'] = slice_step - # infer_flags - attrs['infer_flags'] = infer_flags - - out = var - if use_strided_slice == False and len(slice_axis) > 0: - # append slice_op here - slice_out_var = target_block.create_var( - name=unique_name.generate_with_ignorable_key(var.name + "_slice"), - dtype=var.dtype) - - target_block.append_op( - type="slice", - inputs=inputs, - outputs={'Out': [slice_out_var]}, - attrs=attrs) - - out = slice_out_var - elif use_strided_slice == True and len(slice_axis) > 0: - strided_slice_out_var = target_block.create_var( - name=unique_name.generate_with_ignorable_key(var.name + - "_strided_slice"), - dtype=var.dtype) - target_block.append_op( - type="strided_slice", - inputs=inputs, - outputs={'Out': [strided_slice_out_var]}, - attrs=attrs) - - out = strided_slice_out_var - - if len(reverse_axis) > 0: - reverse_out_var = target_block.create_var( - name=unique_name.generate_with_ignorable_key(var.name + - "_slice_reverse"), - dtype=var.dtype) - target_block.append_op( - type="reverse", - inputs={'X': out}, - outputs={'Out': [reverse_out_var]}, - attrs={'axis': reverse_axis}) - - out = reverse_out_var - - return out - - @six.add_metaclass(VariableMetaClass) class Variable(object): """ @@ -1832,160 +1634,7 @@ class Variable(object): return _getitem_impl_(self, item) def __setitem__(self, item, value): - inputs = {'Input': self} - - # 1. Parse item - if not isinstance(item, tuple): - item = [item] - - decrease_axes = [] - axes = [] - starts = [] - ends = [] - steps = [] - - max_integer = sys.maxsize - - def replace_ellipsis(item): - # Use slice(None) to replace Ellipsis. - # For var, var.shape = [3,4,5,6] - # - # var[..., 1:2] -> var[:, :, :, 1:2] - # var[0, ...] 
-> var[0] - # var[0, ..., 1:2] -> var[0, :, :, 1:2] - - item = list(item) - - # Remove Variable to skip bug when counting Ellipsis - item_remove_var = [ - ele for ele in item if not isinstance(ele, Variable) - ] - ell_count = item_remove_var.count(Ellipsis) - if ell_count == 0: - return item - elif ell_count > 1: - raise IndexError( - "An index can only have a single ellipsis ('...')") - - ell_idx = item.index(Ellipsis) - - if ell_idx == len(item) - 1: - return item[:-1] - else: - item[ell_idx:ell_idx + 1] = [slice(None)] * ( - len(self.shape) - len(item) + 1) - - return item - - item = replace_ellipsis(item) - - for dim, slice_item in enumerate(item): - if isinstance(slice_item, slice): - start = slice_item.start - end = slice_item.stop - step = slice_item.step - - if start is None and end is None and step is None: - continue - - step = 1 if step is None else step - - # TODO: support cases when step < 1 - if not isinstance(step, Variable) and step == 0: - raise ValueError( - "When assign a value to a paddle.Tensor, step can not be 0, " - "but received step is {}.".format(step)) - - if isinstance(step, Variable) and (start is None or - end is None): - raise ValueError( - "When assign a value to a paddle.Tensor, it's not supported that " - "the start or end is None when the type of step is paddle.Tensor." - ) - - if start is None: - start = 0 if step > 0 else max_integer - - if end is None: - end = max_integer if step > 0 else (0 - max_integer) - else: - decrease_axes.append(dim) - start = slice_item - end = slice_item + 1 if slice_item != -1 else max_integer - step = 1 - - axes.append(dim) - starts.append(start) - ends.append(end) - steps.append(step) - - attrs = { - 'axes': axes, - 'starts': starts, - 'ends': ends, - 'steps': steps, - 'decrease_axes': decrease_axes - } - - from .layers import utils - if utils._contain_var(starts): - inputs['StartsTensorList'] = utils._convert_to_tensor_list(starts) - del attrs['starts'] - if utils._contain_var(ends): - inputs['EndsTensorList'] = utils._convert_to_tensor_list(ends) - del attrs['ends'] - if utils._contain_var(steps): - inputs['StepsTensorList'] = utils._convert_to_tensor_list(steps) - del attrs['steps'] - - # 2. Parse value - dtype = self.dtype - attrs['dtype'] = dtype - - from .data_feeder import convert_dtype - # 2.1 value is an integer of float - if isinstance(value, (int, float)): - value = np.array([value]).astype(convert_dtype(dtype)) - - # 2.2 value is a np.ndarray - if isinstance(value, np.ndarray): - shape = list(value.shape) - if dtype == core.VarDesc.VarType.BOOL: - value_name = "bool_values" - values = [bool(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.FP32: - value_name = "fp32_values" - values = [float(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.FP64: - value_name = "fp64_values" - values = [float(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.INT32: - value_name = "int32_values" - values = [int(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.INT64: - value_name = "int64_values" - values = [int(v) for v in value.flat] - else: - raise TypeError( - "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " - "the data type of the paddle.Tensor must be bool, float32, int32 or int64, but " - "received %s." 
% convert_dtype(dtype)) - attrs[value_name] = values - attrs["shape"] = shape - - elif isinstance(value, Variable): - inputs["ValueTensor"] = value - else: - raise TypeError( - "Only support to assign an integer, float, numpy.ndarray or " - "paddle.Tensor to a paddle.Tensor, but received {}".format( - type(value))) - - cur_block = default_main_program().current_block() - cur_block.append_op( - type="set_value", inputs=inputs, outputs={'Out': self}, attrs=attrs) - - return self + return _setitem_impl_(self, item, value) def get_value(self, scope=None): """ @@ -3222,14 +2871,22 @@ class Block(object): if attrs else {}, kwargs.get("stop_gradient", False)) else: + from paddle.fluid.dygraph.base import param_guard + op_desc = self.desc.append_op() - op = Operator( - block=self, - desc=op_desc, - type=kwargs.get("type", None), - inputs=kwargs.get("inputs", None), - outputs=kwargs.get("outputs", None), - attrs=kwargs.get("attrs", None)) + # NOTE(Aurelius84): In case of @to_static, all VarBase(s) should + # be converted into Variable(s) with same name and block location. + # This is ONE and ONLY logic of type transformation of dy2static. + inputs = kwargs.get("inputs", None) + outputs = kwargs.get("outputs", None) + with param_guard(inputs), param_guard(outputs): + op = Operator( + block=self, + desc=op_desc, + type=kwargs.get("type", None), + inputs=inputs, + outputs=outputs, + attrs=kwargs.get("attrs", None)) self.ops.append(op) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 0853d05ef3bbe64bdbe5d5a40f91435612ba3d1f..6fdca1c77a13c12de44ccbecf9023765bbe0325b 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -123,7 +123,7 @@ class DownpourServer(Server): support_accessor_class = [ 'DownpourFeatureValueAccessor', 'DownpourCtrAccessor', 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor', - 'DownpourUnitAccessor' + 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor' ] if strategy.get('sparse_accessor_class') is not None: accessor_class = strategy.get('sparse_accessor_class') @@ -254,7 +254,7 @@ class DownpourServer(Server): table2.param = 2 table2.converter = converter table2.deconverter = deconverter - elif accessor_class == 'DownpourUnitAccessor': + elif accessor_class == 'DownpourUnitAccessor' or accessor_class == 'DownpourDoubleUnitAccessor': self.add_sparse_table_common_config(table, strategy) self.add_sparse_optimizer(table.accessor.embed_sgd_param, strategy, "embed_") @@ -380,7 +380,7 @@ class DownpourServer(Server): table.accessor.fea_dim = fea_dim def add_sparse_optimizer(self, sgd, strategy, prefix): - optimizer_name = strategy.get(prefix + "sparse_optimizer", "adam") + optimizer_name = strategy.get(prefix + "sparse_optimizer", "adagrad") sgd.name = optimizer_name if optimizer_name == "naive": sgd.naive.learning_rate = \ @@ -394,6 +394,19 @@ class DownpourServer(Server): strategy.get(prefix + 'sparse_learning_rate', 0.05) sgd.adagrad.initial_range = \ strategy.get(prefix + 'sparse_initial_range', 1e-4) + if prefix == "embed_": + sgd.adagrad.initial_range = 0 + sgd.adagrad.initial_g2sum = strategy.get( + prefix + 'sparse_initial_g2sum', 3) + bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) + sgd.adagrad.weight_bounds.extend(bounds) + elif optimizer_name == "std_adagrad": + sgd.adagrad.learning_rate = \ + strategy.get(prefix + 'sparse_learning_rate', 0.05) + 
sgd.adagrad.initial_range = \ + strategy.get(prefix + 'sparse_initial_range', 1e-4) + if prefix == "embed_": + sgd.adagrad.initial_range = 0 sgd.adagrad.initial_g2sum = strategy.get( prefix + 'sparse_initial_g2sum', 3) bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index f83dfd6a4eb1463921149c022ea5074062dd254e..884afb97e8f756e9d45ceb1c16121e48911eb9f4 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -489,6 +489,7 @@ class DistributedAdam(DistributedOptimizerImplBase): # user do not have to set it in config_fleet if accessor == "DownpourFeatureValueAccessor" \ or accessor == "DownpourCtrAccessor" \ + or accessor == "DownpourDoubleUnitAccessor" \ or accessor == "DownpourUnitAccessor": if st.get("sparse_embedx_dim") is not None \ and st["sparse_embedx_dim"] != emb_to_size[key] - 3: @@ -769,7 +770,7 @@ class DistributedAdam(DistributedOptimizerImplBase): if server._server.downpour_server_param.downpour_table_param[ 0].accessor.accessor_class in [ "DownpourCtrAccessor", "DownpourCtrDoubleAccessor", - "DownpourUnitAccessor" + "DownpourUnitAccessor", "DownpourDoubleUnitAccessor" ]: opt_info["dump_slot"] = True elif server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 30baa2aa26cda3be0ea05e1e55ae3c8999b33740..30a0b4053e6ffa26274e9eb99c311e4138cb815c 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1788,7 +1788,7 @@ def _legacy_save(param_dict, model_path, protocol=2): @static_only -def save(program, model_path, protocol=2, **configs): +def save(program, model_path, protocol=4, **configs): """ :api_attr: Static Graph @@ -1802,7 +1802,7 @@ def save(program, model_path, protocol=2, **configs): program(Program) : The program to saved. model_path(str): the file prefix to save the program. The format is "dirname/file_prefix". If file_prefix is empty str. A exception will be raised protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. - Default: 2 + Default: 4 configs(dict, optional) : optional keyword arguments. Returns: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index aa021c463bf3d70a27290f5251614ff477586051..f87485c6a8f220f13bf3391de48a68a43b700cc5 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -14772,7 +14772,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): the size of the last shard will be less than the calculated `shard_size` Args: - input (Tensor): Input indices with data type int64. It's last dimension must be 1. + input (Tensor): Input indices with data type int64 or int32. It's last dimension must be 1. index_num (int): An integer defining the range of the index. nshards (int): The number of shards. shard_id (int): The index of the current shard. 
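Editor's sketch (not part of the patch): the nn.py hunks here extend shard_index to accept int32 as well as int64 indices. A minimal usage example under that assumption (it presumes the shard_index kernel handles the int32 input end to end, as this dtype-check change implies):

import paddle
import paddle.fluid as fluid

# int32 labels no longer need an explicit cast to int64 before sharding.
label = paddle.to_tensor([[16], [1]], dtype='int32')
shard_label = fluid.layers.shard_index(
    input=label, index_num=20, nshards=2, shard_id=0, ignore_value=-1)
print(shard_label)  # expected: [[-1], [1]]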
@@ -14793,7 +14793,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): print(shard_label) # [[-1], [1]] """ - check_variable_and_dtype(input, 'input', ['int64'], 'shard_index') + check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') op_type = 'shard_index' helper = LayerHelper(op_type, **locals()) if shard_id < 0 or shard_id >= nshards: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c0c07f593a3ed2f611c12982300e51ce1ff69664..987918493d3b4aa1d313403cbb1aa6ffc8c8e6e9 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -580,8 +580,12 @@ def assign(input, output=None): input = numpy.array([input]) elif isinstance(input, (list, tuple)): input = numpy.array(input) - - if isinstance(input, Variable): + # NOTE(Aurelius84): Why we judge core.VarBase? + # In case of @to_static, a VarBase can be as input of `assign`, + # but in_dygraph_mode()==False under @to_static, which means + # isinstance(VarBase, Variable) == False. It will cause return None + # after this api. + if isinstance(input, (Variable, core.VarBase)): check_dtype(input.dtype, 'input', [ 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', 'bool' ], 'assign', '(When the type of input in assign is Variable.)') diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 41b2843ea33e936ccb2b75831bcbad4b7cb7cb4e..c0b93c83f78e125cf628031580550036fdc2692e 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -125,6 +125,8 @@ class Optimizer(object): # to train. These variables are called accumulators. # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) + # global_accumulator dict, {accum_name : acc_variable, ...} + self._global_accumulators = {} self.helper = None self._opti_name_list = [] self._accumulators_holder = {} @@ -157,6 +159,8 @@ class Optimizer(object): for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp + for k, v in self._global_accumulators.items(): + state_dict[v.name] = v # global step if use lr decay if isinstance(self._learning_rate, LRScheduler): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() @@ -236,36 +240,42 @@ class Optimizer(object): "Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ", type(global_step)) + def _load_state_para(state_dict, param): + var = param.value() + tensor = var.get_tensor() + model_np = np.array(tensor) + load_para = state_dict[param.name] + if isinstance(load_para, Variable): + load_para_np = load_para.numpy() + elif isinstance(load_para, core.VarBase): + load_para_np = load_para.numpy() + elif isinstance(load_para, np.ndarray): + load_para_np = load_para + else: + raise RuntimeError("State dict type {} not supprt".format( + str(type(load_para)))) + + assert model_np.shape == load_para_np.shape, \ + "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( + item.name, model_np.shape, load_para_np.shape) + + assert model_np.dtype == load_para_np.dtype, \ + "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + item.name, model_np.dtype, load_para_np.dtype) + + tensor.set(load_para_np, framework._current_expected_place()) + self._accumulators_holder = state_dict for k, v in 
self._accumulators.items(): for para_name, var_tmp in v.items(): assert var_tmp.name in state_dict, \ "optimizer variable {} not found".format( var_tmp.name ) - var = var_tmp.value() - tensor = var.get_tensor() - model_np = np.array(tensor) - - load_para = state_dict[var_tmp.name] - - if isinstance(load_para, Variable): - load_para_np = load_para.numpy() - elif isinstance(load_para, core.VarBase): - load_para_np = load_para.numpy() - elif isinstance(load_para, np.ndarray): - load_para_np = load_para - else: - raise RuntimeError("State dict type {} not supprt".format( - str(type(load_para)))) - - assert model_np.shape == load_para_np.shape, \ - "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - item.name, model_np.shape, load_para_np.shape) - - assert model_np.dtype == load_para_np.dtype, \ - "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - item.name, model_np.dtype, load_para_np.dtype) + _load_state_para(state_dict, var_tmp) - tensor.set(load_para_np, framework._current_expected_place()) + for k, v in self._global_accumulators.items(): + assert v.name in state_dict, \ + "optimizer variable {} not found".format( v.name ) + _load_state_para(state_dict, v) # [aliases] Compatible with old method names set_dict = set_state_dict @@ -589,6 +599,60 @@ class Optimizer(object): self._accumulators[name][param.name] = var return var + def _add_global_accumulator(self, + name, + dtype=None, + fill_value=0.0, + shape=None, + type=None, + device=None): + """Utility function to add a global accumulator for all parameters in the model + + Args: + block: the block in which the loss variable is present + name: name of the accumulator + dtype: data type of the accumulator variable + fill_value: value to initialize the accumulator variable + shape: the shape of the accumulator + type: the variable type of the accumulator + device: the target place of the accumulator + """ + if self._name is not None: + name = self._name + "_" + name + if (name in self._global_accumulators): + if framework.in_dygraph_mode(): + return self._global_accumulators[name] + raise Exception("Global accumulator {} already exists".format(name)) + if shape == None: + shape = [1] # most case, global accumulator is of shape [1] + assert isinstance(self.helper, LayerHelper) + + var_name = name + var_name = unique_name.generate(var_name) + self._opti_name_list.append(var_name) + + var = self.helper.create_global_variable( + name=var_name, + persistable=True, + dtype=dtype if dtype else self._dtype, + type=type, + shape=shape, + belong_to_optimizer=True) + if device is None: + device = 'cpu' + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value))) + + if framework.in_dygraph_mode(): + if len(self._accumulators_holder) > 0: + assert var_name in self._accumulators_holder, \ + "Optimizer set error, {} should in state dict".format( var_name ) + var.set_value(self._accumulators_holder[var_name]) + + self._global_accumulators[name] = var + return var + def _get_accumulator(self, name, param): """Utility function to fetch an accumulator for a parameter @@ -597,7 +661,7 @@ class Optimizer(object): param: parameter variable for which accumulator is to be fetched Returns: - accumulator variable for the parameter + accumulator variable """ if self._name is not None: name = self._name + "_" + name @@ -607,6 +671,21 @@ class Optimizer(object): 
format(name, param.name)) return self._accumulators[name][param.name] + def _get_global_accumulator(self, name): + """Utility function to fetch a global accumulator + + Args: + name: name of the accumulator + + Returns: + accumulator variable + """ + if self._name is not None: + name = self._name + "_" + name + if (name not in self._global_accumulators): + raise Exception("Global accumulator {} does not exist".format(name)) + return self._global_accumulators[name] + def _update_param_device_map(self, parameters_and_grads, target_block): for param_and_grad in parameters_and_grads: if param_and_grad[0].trainable is True: @@ -1915,6 +1994,8 @@ class AdamOptimizer(Optimizer): gradient in current mini-batch, so it will be much more faster. But this mode has different semantics with the original Adam algorithm and may lead to different result. The default value is False. + use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow + for whole model instead of creating beta_pow for each parameter. Default is false. Examples: .. code-block:: python @@ -2024,7 +2105,8 @@ class AdamOptimizer(Optimizer): regularization=None, grad_clip=None, name=None, - lazy_mode=False): + lazy_mode=False, + use_global_beta_pow=False): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -2040,6 +2122,7 @@ class AdamOptimizer(Optimizer): self._beta2 = beta2 self._epsilon = epsilon self._lazy_mode = lazy_mode + self._use_global_beta_pow = use_global_beta_pow def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -2048,16 +2131,30 @@ class AdamOptimizer(Optimizer): for p in parameters: self._add_accumulator(self._moment1_acc_str, p) self._add_accumulator(self._moment2_acc_str, p) - self._add_accumulator( + if not self._use_global_beta_pow: + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + if self._use_global_beta_pow: + self._add_global_accumulator( name=self._beta1_pow_acc_str, - param=p, fill_value=0.9 if isinstance(self._beta1, Variable) \ else self._beta1, shape=[1], type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') - self._add_accumulator( + self._add_global_accumulator( name=self._beta2_pow_acc_str, - param=p, fill_value=0.999 if isinstance(self._beta2, Variable) \ else self._beta2, shape=[1], @@ -2070,10 +2167,16 @@ class AdamOptimizer(Optimizer): param_and_grad[0]) moment2 = self._get_accumulator(self._moment2_acc_str, param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param_and_grad[0]) + if self._use_global_beta_pow: + beta1_pow_acc = self._get_global_accumulator( + self._beta1_pow_acc_str) + beta2_pow_acc = self._get_global_accumulator( + self._beta2_pow_acc_str) + else: + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) lr = self._create_param_lr(param_and_grad) # create the adam optimize op @@ -2087,7 +2190,8 @@ class AdamOptimizer(Optimizer): beta1_pow_acc, beta2_pow_acc, 
param_and_grad[0], moment1, moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 'beta1', _beta1, 'beta2', _beta2) + 1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow', + self._use_global_beta_pow) return None @@ -2109,7 +2213,8 @@ class AdamOptimizer(Optimizer): } attrs = { "lazy_mode": self._lazy_mode, - "min_row_size_to_use_multithread": 1000 + "min_row_size_to_use_multithread": 1000, + 'use_global_beta_pow': self._use_global_beta_pow } if isinstance(self._beta1, Variable): @@ -2134,6 +2239,43 @@ class AdamOptimizer(Optimizer): return adam_op + def _finish_update(self, block, parameters_and_grads): + r"""Update beta1_pow and beta2_pow accumulator + """ + assert isinstance(block, framework.Block) + if self._use_global_beta_pow: + beta1_pow_acc = self._get_global_accumulator( + self._beta1_pow_acc_str) + beta2_pow_acc = self._get_global_accumulator( + self._beta2_pow_acc_str) + + with block.program._optimized_guard([]): + inputs = {"X": beta1_pow_acc} + attrs = {} + if isinstance(self._beta1, Variable): + inputs['ScaleTensor'] = self._beta1 + else: + attrs['scale'] = self._beta1 + block.append_op( + type="scale", + inputs=inputs, + outputs={"Out": beta1_pow_acc}, + attrs=attrs, + stop_gradient=True) + + inputs = {"X": beta2_pow_acc} + attrs = {} + if isinstance(self._beta2, Variable): + inputs['ScaleTensor'] = self._beta2 + else: + attrs['scale'] = self._beta2 + block.append_op( + type="scale", + inputs=inputs, + outputs={"Out": beta2_pow_acc}, + attrs=attrs, + stop_gradient=True) + class AdamaxOptimizer(Optimizer): r""" @@ -4116,7 +4258,7 @@ class PipelineOptimizer(object): device = op.attr(self._op_device_key) \ if op.has_attr(self._op_device_key) else None if device: - assert device[0:3] == 'gpu' or dev_type == 'npu', "Now, only gpu and npu devices are " \ + assert device[0:3] == 'gpu' or device[0:3] == 'npu', "Now, only gpu and npu devices are " \ "supported in pipeline parallemism." 
return device @@ -4200,6 +4342,8 @@ class PipelineOptimizer(object): op.type == 'elementwise_div'): device = "gpu:all" op._set_attr(self._op_device_key, device) + elif op.type == "alloc_float_status": + op._set_attr(self._op_device_key, "gpu:all") else: other_known_ops = [ 'update_loss_scaling', @@ -4207,6 +4351,7 @@ class PipelineOptimizer(object): 'concat', 'sum', 'check_finite_and_unscale', + 'alloc_float_status', ] assert op.type in other_known_ops, "For other ops without " \ "op_device set, they must be one of {}, but it " \ @@ -4272,8 +4417,9 @@ class PipelineOptimizer(object): "{} has not been set.".format(op.type)) if device == "gpu:all": continue dev_type = device.split(':')[0] - assert dev_type == "gpu", ("Now only gpu devices are supported " - "for pipeline parallelism.") + assert dev_type == "gpu" or dev_type == 'npu', ( + "Now only gpu and npu devices are supported " + "for pipeline parallelism.") if not device in device_list: device_list.append(device) return device_list diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 110665186c0e2bded12274f8a3883bb8c028dd10..c4a256f0e193d750516baffb6184e273ce1ba246 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -23,7 +23,8 @@ list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_layer) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. @@ -179,7 +180,8 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_layer) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) @@ -558,13 +560,13 @@ if(WITH_DISTRIBUTE) set(dist_ut_port 20001) foreach(TEST_OP ${DIST_TEST_OPS}) bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") - MATH(EXPR dist_ut_port "${dist_ut_port}+40") + MATH(EXPR dist_ut_port "${dist_ut_port}+35") if(dist_ut_port GREATER_EQUAL 22998) message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() endforeach(TEST_OP) # solve it later. 
- # bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() @@ -866,7 +868,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_pipeline_layer PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py index cfe70cf29223920efc8a5705ecd64c8484cbe9d3..815018dc4b2f4e56881aa9d2a09c91f3b48b87c4 100644 --- a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py @@ -69,7 +69,7 @@ class TestColumnParallelLinearAPI(TestCollectiveAPIRunnerBase): axis=1, num_partitions=2, weight_attr=param_attr, - bias_attr=False, ) + bias_attr=True, ) return [linear_out] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py new file mode 100644 index 0000000000000000000000000000000000000000..647c9e9672cf0c0cfca8ac64e100478c3e9f5fe7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py @@ -0,0 +1,91 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import unittest +import numpy as np + + +class BufferLayers(paddle.nn.Layer): + def __init__(self, out_channel): + super(BufferLayers, self).__init__() + self.out_channel = out_channel + + def forward(self, x): + mean = paddle.mean(x) + if mean < 0.: + x = x * self._mask() + + out = x - mean + return out + + def _mask(self): + return paddle.to_tensor(np.zeros([self.out_channel], 'float32')) + + +class SequentialNet(paddle.nn.Layer): + def __init__(self, sub_layer, in_channel, out_channel): + super(SequentialNet, self).__init__() + self.layer = paddle.nn.Sequential( + ('l1', paddle.nn.Linear(in_channel, in_channel)), + ('l2', paddle.nn.Linear(in_channel, out_channel)), + ('l3', sub_layer(out_channel))) + + def forward(self, x): + out = self.layer(x) + return out + + +class TestSequential(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + self.seed = 2021 + + def _init_seed(self): + paddle.seed(self.seed) + np.random.seed(self.seed) + + def _run(self, to_static): + self._init_seed() + net = SequentialNet(BufferLayers, 10, 3) + if to_static: + net = paddle.jit.to_static(net) + x = paddle.rand([16, 10], 'float32') + out = net(x) + if to_static: + load_out = self._test_load(net, x) + self.assertTrue( + np.allclose(load_out, out), + msg='load_out is {}\st_out is {}'.format(load_out, out)) + + return out + + def test_train(self): + paddle.jit.set_code_level(100) + dy_out = self._run(to_static=False) + st_out = self._run(to_static=True) + self.assertTrue( + np.allclose(dy_out, st_out), + msg='dygraph_res is {}\nstatic_res is {}'.format(dy_out, st_out)) + + def _test_load(self, net, x): + model_path = './sequential_net' + paddle.jit.save(net, model_path) + load_net = paddle.jit.load(model_path) + out = load_net(x) + return out + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index 0243ef3a6ddae92efc14ee20d058481be8fa669e..8da4e200cfc3660146ce9cd7191200090109bcd0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest +import paddle import numpy as np import paddle.fluid as fluid from paddle.fluid.dygraph.jit import declarative @@ -61,6 +62,30 @@ def test_list_append_in_for_loop(x, iter_num): return a[0] +def test_list_append_in_for_subscript(x): + x = fluid.dygraph.to_variable(x) + iter_num = paddle.shape(x)[0] + a = [] + for i in range(iter_num): + x = x + 1 + a.append(x) + out = paddle.concat(a) + return out[0] + + +def test_list_append_in_while_loop_subscript(x): + x = fluid.dygraph.to_variable(x) + iter_num = paddle.shape(x)[0] + a = [] + i = 0 + while i < iter_num: + x = x + 1 + a.append(x) + i += 1 + out = paddle.concat(a) + return out[0] + + def test_list_append_in_for_loop_with_concat(x, iter_num): x = fluid.dygraph.to_variable(x) a = [] @@ -261,5 +286,16 @@ class TestListInForLoopWithConcat(TestListInWhileLoopWithStack): self.all_dygraph_funcs = [test_list_append_in_for_loop_with_concat, ] +class TestListInForLoopWithSubscript(TestListWithoutControlFlow): + def init_dygraph_func(self): + self.all_dygraph_funcs = [ + test_list_append_in_for_subscript, + test_list_append_in_while_loop_subscript + ] + + def init_data(self): + self.input = np.random.random((3, 4)).astype('float32') + + if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py new file mode 100644 index 0000000000000000000000000000000000000000..cd3c76412feac90faead8965b7161a6e35d77e66 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py @@ -0,0 +1,171 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import unittest + +from paddle.jit import to_static, ProgramTranslator + + +class NetWithParameterList(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super(NetWithParameterList, self).__init__() + weight = self.create_parameter([in_size, out_size]) + bias = self.create_parameter([out_size], is_bias=True) + self.params = paddle.nn.ParameterList([weight, bias]) + + @to_static + def forward(self, x): + out = paddle.matmul(x, self.params[0]) + out = paddle.add(out, self.params[1]) + out = paddle.tanh(out) + return out + + +class NetWithParameterListIter(NetWithParameterList): + def __init__(self, in_size, out_size): + super(NetWithParameterListIter, self).__init__(in_size, out_size) + + @to_static + def forward(self, x): + # NOTE: manually trigger `__iter__` logic. 
+ params = list(self.params.__iter__()) + out = paddle.matmul(x, params[0]) + out = paddle.add(out, params[1]) + out = paddle.tanh(out) + return out + + +class TestParameterList(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.iter_num = 5 + self.prog_trans = ProgramTranslator() + + def train(self, is_iter, to_static): + paddle.seed(self.seed) + np.random.seed(self.seed) + self.prog_trans.enable(to_static) + if is_iter: + net = NetWithParameterList(10, 3) + else: + net = NetWithParameterListIter(10, 3) + sgd = paddle.optimizer.SGD(0.1, parameters=net.parameters()) + + for batch_id in range(self.iter_num): + x = paddle.rand([4, 10], dtype='float32') + out = net(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + return loss + + def test_parameter_list(self): + static_loss = self.train(False, to_static=True) + dygraph_loss = self.train(False, to_static=False) + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss, + static_loss)) + + def test_parameter_list_iter(self): + static_loss = self.train(True, to_static=True) + dygraph_loss = self.train(True, to_static=False) + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss, + static_loss)) + + +class NetWithRawParamList(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super(NetWithRawParamList, self).__init__() + weight = self.add_parameter('w', + self.create_parameter([in_size, out_size])) + bias = self.add_parameter( + 'b', self.create_parameter( + [out_size], is_bias=True)) + self.params = [weight] + self.bias_dict = {'b': bias} + + @to_static + def forward(self, x): + out = paddle.matmul(x, self.params[0]) + out = paddle.add(out, self.bias_dict['b']) + out = paddle.tanh(out) + return out + + +class TestRawParameterList(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.iter_num = 5 + self.prog_trans = ProgramTranslator() + + def init_net(self): + self.net = NetWithRawParamList(10, 3) + + def train(self, to_static): + paddle.seed(self.seed) + np.random.seed(self.seed) + self.prog_trans.enable(to_static) + self.init_net() + + sgd = paddle.optimizer.SGD(0.1, parameters=self.net.parameters()) + + for batch_id in range(self.iter_num): + x = paddle.rand([4, 10], dtype='float32') + out = self.net(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + return loss + + def test_parameter_list(self): + static_loss = self.train(to_static=True) + dygraph_loss = self.train(to_static=False) + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss, + static_loss)) + + +class NetWithSubLayerParamList(paddle.nn.Layer): + def __init__(self, sub_layer): + super(NetWithSubLayerParamList, self).__init__() + self.sub_layer = sub_layer + self.params = [sub_layer.weight] + self.bias_dict = {'b': sub_layer.bias} + + @to_static + def forward(self, x): + out = paddle.matmul(x, self.params[0]) + out = paddle.add(out, self.bias_dict['b']) + out = paddle.tanh(out) + return out + + +class TestSubLayerParameterList(TestRawParameterList): + def init_net(self): + fc = paddle.nn.Linear(10, 3) + self.net = NetWithSubLayerParamList(fc) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py index 
dfbef998a2f07ab697d27b19197b3cb65cb41205..349d5f82dbf545c321944c88dfce7d688c78ec89 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py @@ -231,7 +231,7 @@ class TestDistTraning(unittest.TestCase): # model_b check_group = dist.new_group(list(range(self.model_parallel_size))) integral_w = [] - partial_w = model_a.embedding.embedding.weight.clone().detach() + partial_w = model_a.embedding.weight.clone().detach() paddle.distributed.all_gather(integral_w, partial_w, group=check_group) result_w = [] for idx in range(len(integral_w)): diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py index 767bf5d57e74aff64d13170267785c6a8ed4347b..a9f251f3079cef2860c2599f6a8d33abf8da5fb8 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py @@ -37,6 +37,7 @@ hidden_size = 10 inner_size = 8 output_size = 2 seq_length = 2 +batch_size = 4 class SimpleMPNet(fluid.dygraph.Layer): @@ -130,18 +131,6 @@ class SimpleDPNet(fluid.dygraph.Layer): return x -class TrainDataset(Dataset): - def __init__(self, length): - self.length = length - - def __len__(self): - return self.length - - def __getitem__(self, index): - np_input_data = np.random.randint(0, vocab_size, (seq_length, )) - return np_input_data - - class TestDistMPTraning(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() @@ -178,20 +167,6 @@ class TestDistMPTraning(unittest.TestCase): np_fc1 = np.random.random_sample((hidden_size, inner_size)) np_fc2 = np.random.random_sample((inner_size, hidden_size)) - train_data = TrainDataset(length=10000) - - train_batch_sampler = paddle.io.DistributedBatchSampler( - train_data, - batch_size=4, - shuffle=False, - num_replicas=self.data_parallel_size, - rank=dp_id) - train_data_loader = DataLoader( - dataset=train_data, - batch_sampler=train_batch_sampler, - num_workers=0, - return_list=True) - model_a = SimpleMPNet(vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2, mp_id) optimizer_a = self.build_optimizer(model_a) @@ -202,16 +177,17 @@ class TestDistMPTraning(unittest.TestCase): np_fc1, np_fc2) optimizer_b = self.build_optimizer(model_b) - return model_a, optimizer_a, model_b, optimizer_b, train_data_loader + return model_a, optimizer_a, model_b, optimizer_b def test_mp_model(self): - model_a, optimizer_a, model_b, optimizer_b, train_data_loader = self.build_model_optimizer( + model_a, optimizer_a, model_b, optimizer_b = self.build_model_optimizer( ) - for step, batch in enumerate(train_data_loader): - if step > 5: - return - + for _ in range(5): + np_data = np.random.randint(0, vocab_size, ( + batch_size, + seq_length, )) + batch = paddle.to_tensor(np_data) loss_a = self.train_batch(batch, model_a, optimizer_a, True) loss_b = self.train_batch(batch, model_b, optimizer_b, False) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..912849ffbeb71c953e9316a701e5add24ddf4440 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from hybrid_parallel_pp_layer import AlexNetPipeDesc, AlexNet + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 4 +micro_batch_size = 2 + + +class TestDistPPTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + set_random_seed(1024, dp_id, rank_id) + + #construct model a + model_a = AlexNet(10) + scheduler_a = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + parameters=model_a.parameters()) + + param_len = len(model_a.parameters()) + + parameters = [] + for param in model_a.parameters(): + parameters.append(param.numpy()) + + # construct model b + model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) + scheduler_b = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + parameters=model_b.parameters()) + model_b = fleet.distributed_model(model_b) + optimizer_b = fleet.distributed_optimizer(optimizer_b) + + for idx, param in enumerate(model_b.parameters()): + param.set_value(parameters[idx + pp_id * (param_len // 2)]) + + # construct reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size, drop_last=True) + + for step_id, data in enumerate(train_reader()): + x_data = np.array([x[0] for x in data]).astype('float32').reshape( + batch_size, 1, 28, 28) + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) + img.stop_gradient = True + label.stop_gradient = True + + if step_id >= 5: + return True + + loss_a = model_a(img, label) + loss_a.backward() + optimizer_a.step() + optimizer_a.clear_grad() + scheduler_a.step() + + loss_b = model_b.train_batch([img, label], optimizer_b, scheduler_b) + + print("loss: ", loss_a.numpy(), loss_b.numpy()) + 
np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=5e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..d2be0cb80722b4c540b10f05027de4514dbb5a4f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py @@ -0,0 +1,208 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from paddle.fluid.dygraph.container import Sequential +from paddle.distributed.fleet.meta_parallel import PipelineLayer +from paddle.fluid.dygraph.layers import Layer +import paddle.nn as nn +import paddle.fluid as fluid + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 16 +micro_batch_size = 4 +vocab_size = 128 +hidden_size = 8 + + +class SimpleNet(Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, hidden_size) + + self.softmax_weight = self.create_parameter( + shape=[hidden_size, vocab_size]) + self.softmax_bias = self.create_parameter( + shape=[vocab_size], is_bias=False) + + def forward(self, x1, x2, y1): + x_emb = self.word_embeddings(x1) + fc = fluid.layers.matmul(x_emb, self.softmax_weight) + fc = fluid.layers.elementwise_add(fc, self.softmax_bias) + projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=y1, soft_label=False) + return loss.mean() + + +class EmbeddingNet(Layer): + def __init__(self): + super(EmbeddingNet, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, hidden_size) + + def forward(self, args): + x1, x2 = args + x_emb = self.word_embeddings(x1) + return x_emb, x2 + + +class MatmulNet(Layer): + def __init__(self): + super(MatmulNet, self).__init__() + self.softmax_weight = self.create_parameter( + shape=[hidden_size, vocab_size]) + + def forward(self, args): + x1, x2 = args + fc = fluid.layers.matmul(x1, self.softmax_weight) + + return fc, x2 + + +class BiasNet(Layer): + def __init__(self): + super(BiasNet, self).__init__() + self.softmax_bias = self.create_parameter(shape=[vocab_size]) + + def forward(self, args): + fc, x2 = args + fc = fluid.layers.elementwise_add(fc, self.softmax_bias) + projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + return projection, x2 + + +class LossNet(Layer): + def __init__(self): + super(LossNet, self).__init__() + + def forward(self, args, y1): + projection, x2 = args + loss = 
fluid.layers.softmax_with_cross_entropy( + logits=projection, label=y1[0], soft_label=False) + return loss.mean() + + +class SimpleNetPipe(Layer): + def __init__(self): + super(SimpleNetPipe, self).__init__() + self.features = Sequential(EmbeddingNet(), MatmulNet(), BiasNet()) + + def to_layers(self): + feat = [self.features[i] for i in range(len(self.features))] + return feat + + +class TestDistEmbeddingTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + set_random_seed(1024, dp_id, rank_id) + + #construct model a + model_a = SimpleNet() + scheduler_a = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True) + optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + parameters=model_a.parameters()) + + init_net = SimpleNetPipe() + model_b = PipelineLayer( + layers=init_net.to_layers(), + num_stages=self.pipeline_parallel_size, + loss_fn=LossNet()) + + scheduler_b = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True) + optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + parameters=model_b.parameters()) + model_b = fleet.distributed_model(model_b) + optimizer_b = fleet.distributed_optimizer(optimizer_b) + + param_len = len(model_a.parameters()) + + parameters = [] + for param in model_a.parameters(): + print(param.name, param.shape) + parameters.append(param.numpy()) + + model_b_params = model_b.parameters() + if pp_id == 0: + model_b_params[0].set_value(parameters[2]) + else: + model_b_params[0].set_value(parameters[0]) + model_b_params[1].set_value(parameters[1]) + + for step in range(5): + x1_data = np.random.randint(0, vocab_size, size=[batch_size, 1]) + x2_data = np.random.randint(0, vocab_size, size=[batch_size, 1]) + y1_data = np.random.randint(0, 10, size=[batch_size, 1]) + + x1 = paddle.to_tensor(x1_data) + x2 = paddle.to_tensor(x2_data) + y1 = paddle.to_tensor(y1_data) + + x1.stop_gradient = True + x2.stop_gradient = True + y1.stop_gradient = True + + loss_a = model_a(x1, x2, y1) + loss_a.backward() + optimizer_a.step() + optimizer_a.clear_grad() + scheduler_a.step() + + loss_b = model_b.train_batch([(x1, x2), (y1, )], optimizer_b, + scheduler_b) + + print("loss", loss_a.numpy(), loss_b.numpy()) + np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py index 3130cbf458467acfc70d38a438aa845c40584469..b30df0e9a2f21ba6d9dea0624d3129eae9b32d74 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py @@ -12,17 +12,25 @@ # See the License for the specific language governing permissions and # limitations 
under the License. +import unittest import numpy as np import os import paddle from paddle.distributed import fleet -import copy from paddle.fluid.dygraph.container import Sequential import paddle.nn as nn from paddle.fluid.dygraph.layers import Layer from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer import paddle.nn.functional as F -import unittest + + +class ReshapeHelp(Layer): + def __init__(self, shape): + super(ReshapeHelp, self).__init__() + self.shape = shape + + def forward(self, x): + return x.reshape(shape=self.shape) class AlexNet(Layer): @@ -30,7 +38,7 @@ class AlexNet(Layer): super(AlexNet, self).__init__() self.features = Sequential( nn.Conv2D( - 3, 64, kernel_size=11, stride=4, padding=5), + 1, 64, kernel_size=11, stride=4, padding=5), nn.ReLU(), nn.MaxPool2D( kernel_size=2, stride=2), @@ -50,13 +58,14 @@ class AlexNet(Layer): nn.ReLU(), nn.MaxPool2D( kernel_size=2, stride=2), ) + + self.reshape_layer = ReshapeHelp(shape=[-1, 256]) self.classifier = nn.Linear(256, num_classes) self.loss_fn = nn.loss.CrossEntropyLoss() def forward(self, x, y): x = self.features(x) - x.flatten() - + x = self.reshape_layer(x) x = self.classifier(x) return self.loss_fn(x, y) @@ -64,7 +73,7 @@ class AlexNet(Layer): class AlexNetPipe(AlexNet): def to_layers(self): feat = [self.features[i] for i in range(len(self.features))] - loss_fn = [lambda x: x.flatten(), self.classifier] + loss_fn = [self.reshape_layer, self.classifier] feat.extend(loss_fn) return feat @@ -74,7 +83,7 @@ class AlexNetPipeDesc(PipelineLayer): self.num_classes = num_classes decs = [ LayerDesc( - nn.Conv2D, 3, 64, kernel_size=11, stride=4, padding=5), + nn.Conv2D, 1, 64, kernel_size=11, stride=4, padding=5), LayerDesc(nn.ReLU), LayerDesc( nn.MaxPool2D, kernel_size=2, stride=2), @@ -94,7 +103,8 @@ class AlexNetPipeDesc(PipelineLayer): F.relu, LayerDesc( nn.MaxPool2D, kernel_size=2, stride=2), - lambda x: x.flatten(), + LayerDesc( + ReshapeHelp, shape=[-1, 256]), LayerDesc(nn.Linear, 256, self.num_classes), # classifier ] super(AlexNetPipeDesc, self).__init__( @@ -104,24 +114,24 @@ class AlexNetPipeDesc(PipelineLayer): class TestPipeLayerAPI(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() - self.model_parallel_size = 2 + self.pipeline_parallel_size = 2 strategy.hybrid_configs = { "dp_degree": 1, "mp_degree": 1, - "pp_degree": self.model_parallel_size + "pp_degree": self.pipeline_parallel_size } fleet.init(is_collective=True, strategy=strategy) self.hcg = fleet.get_hybrid_communicate_group() def test_pipelayer_desc(self): - pipe_model = AlexNetPipeDesc(num_stages=self.model_parallel_size) + pipe_model = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) np.testing.assert_array_equal(len(pipe_model.parameters()), 6) def test_pipelayer_sequential(self): init_net = AlexNetPipe() pipe_model = PipelineLayer( layers=init_net.to_layers(), - num_stages=self.model_parallel_size, + num_stages=self.pipeline_parallel_size, loss_fn=nn.CrossEntropyLoss()) stage_id = self.hcg.get_stage_id() init_parameters = init_net.parameters() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py deleted file mode 100644 index 9b9283a1a9b6ea9e92246db974f501170fc4cb50..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import paddle -import numpy as np -import random -import paddle.distributed as dist -import paddle.fluid as fluid -import paddle.distributed.fleet as fleet -from paddle.io import DataLoader, Dataset -import unittest - - -def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" - random.seed(seed) - np.random.seed(seed + dp_id) - paddle.seed(seed + rank_id) - - -HIDDEN_DIM = 32 -LAYERS = 8 - - -def sequential_model(): - model = paddle.nn.Sequential( - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, 1), ) - return model - - -class TestDistPPTraning(unittest.TestCase): - def setUp(self): - strategy = fleet.DistributedStrategy() - self.model_parallel_size = 1 - self.data_parallel_size = 1 - self.pipeline_parallel_size = 2 - strategy.hybrid_configs = { - "dp_degree": self.data_parallel_size, - "mp_degree": self.model_parallel_size, - "pp_degree": self.pipeline_parallel_size, - } - strategy.pipeline_configs = {"accumulate_steps": 2} - paddle.distributed.init_parallel_env() - fleet.init(is_collective=True, strategy=strategy) - - def test_mp_model(self): - batch_input = paddle.randn(shape=(1, HIDDEN_DIM), dtype="float32") - pipe_model = sequential_model() - sgd = paddle.optimizer.SGD(learning_rate=0.0003, parameters=[]) - pipe_model = paddle.distributed.fleet.distributed_model(pipe_model) - - if pipe_model.stage_id == 0 or pipe_model.stage_id == 1: - pipe_input = batch_input.clone().detach() - pipe_input = paddle.cast(pipe_input, 'float32') - - def data_gen(): - gen = True - while gen: - yield [pipe_input, 0] - gen = False - - loader = paddle.io.DataLoader.from_generator(capacity=5) - loader.set_batch_generator(data_gen) - data_iter = iter(loader) - else: - data_iter = None - return True - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..95de37fdc0251ae9fc809de2e55d5735ef5656de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestCastBF16ToFP32MKLDNNOp(OpTest): + def init_data(self): + self.out = np.random.random(size=[10, 10]).astype("float32") + self.x = convert_float_to_uint16(self.out) + + def setUp(self): + self.init_data() + self.inputs = {'X': self.x} + self.outputs = {'Out': self.out} + prepare_dtype = lambda x: int(core.VarDesc.VarType.BF16 if x.dtype != np.float32 else core.VarDesc.VarType.FP32) + self.attrs = { + 'in_dtype': prepare_dtype(self.x), + 'out_dtype': prepare_dtype(self.out), + 'use_mkldnn': True + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + check_dygraph=False, + user_defined_grads=[self.inputs['X']], + user_defined_grad_outputs=[self.outputs['Out']]) + + +class TestCastFP32ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[2, 6]).astype("float32") + self.out = convert_float_to_uint16(self.x) + + +class TestCastBF16ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[6, 13]).astype("uint16") + self.out = self.x + + +class TestCastFP32ToFP32MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[7, 15]).astype("float32") + self.out = self.x + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py index ba7c8abc56daa91dda364713ffe7aa332610921c..088b4fb59057b43cc8f245938cd679973cca2cea 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py @@ -63,4 +63,6 @@ class TestLRNMKLDNNOpNHWC(TestLRNMKLDNNOp): if __name__ == "__main__": + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py index 149002fc7650873ad1b87618ee2abb5012e67d12..dba63be27b43843e04d198edaeb53a2860d54291 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py @@ -26,22 +26,23 @@ from paddle import enable_static "place does not support BF16 evaluation") class TestMatmulBf16MklDNNOp(OpTest): def generate_data(self): - self.x = np.random.random((25, 2, 2)).astype(np.float32) - self.y = np.random.random((25, 2, 2)).astype(np.float32) - self.alpha = 1.0 - self.out = self.alpha * np.matmul(self.x, self.y) + 
self.x_fp32 = np.random.random((25, 2, 2)).astype(np.float32) + self.y_fp32 = np.random.random((25, 2, 2)).astype(np.float32) + self.out = self.alpha * np.matmul(self.x_fp32, self.y_fp32) def set_attributes(self): - self.alpha = self.alpha if hasattr(self, 'alpha') else 1.0 self.attrs = { 'alpha': self.alpha, "use_mkldnn": self.use_mkldnn, "mkldnn_data_type": self.mkldnn_data_type, - "force_fp32_output": self.force_fp32_output + "force_fp32_output": self.force_fp32_output, + 'transpose_X': False, + 'transpose_Y': False } def setUp(self): self.op_type = "matmul" + self.alpha = 1.0 self.use_mkldnn = True self.dtype = np.uint16 self.mkldnn_data_type = "bfloat16" @@ -53,67 +54,113 @@ class TestMatmulBf16MklDNNOp(OpTest): self.out = convert_float_to_uint16(self.out) self.outputs = {'Out': self.out} - self.x = convert_float_to_uint16(self.x) - self.y = convert_float_to_uint16(self.y) - self.inputs = {'X': self.x, 'Y': self.y} + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.y_bf16 = convert_float_to_uint16(self.y_fp32) + self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} def test_check_output(self): self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - pass + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + check_dygraph=False, + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def matmul_grad(self, x, transpose_x, y, transpose_y): + x_transpose_axes = [1, 0] if x.ndim == 2 else [0, 2, 1] + y_transpose_axes = [1, 0] if y.ndim == 2 else [0, 2, 1] + + x = np.transpose(x, x_transpose_axes) if transpose_x else x + y = np.transpose(y, y_transpose_axes) if transpose_y else y + + return self.alpha * np.matmul(x, y) + + def calculate_grads(self): + x_transpose_axes = [1, 0] if self.x_fp32.ndim == 2 else [0, 2, 1] + y_transpose_axes = [1, 0] if self.y_fp32.ndim == 2 else [0, 2, 1] + + x = np.transpose(self.x_fp32, x_transpose_axes) if self.attrs[ + 'transpose_X'] is True else self.x_fp32 + y = np.transpose(self.y_fp32, y_transpose_axes) if self.attrs[ + 'transpose_Y'] is True else self.y_fp32 + + dout = self.alpha * np.matmul(x, y) + + if self.attrs['transpose_X'] is True and self.attrs[ + 'transpose_Y'] is True: + self.dx = self.matmul_grad(self.y_fp32, True, dout, True) + self.dy = self.matmul_grad(dout, True, self.x_fp32, True) + elif self.attrs['transpose_X'] is True and self.attrs[ + 'transpose_Y'] is False: + self.dx = self.matmul_grad(self.y_fp32, False, dout, True) + self.dy = self.matmul_grad(self.x_fp32, False, dout, False) + elif self.attrs['transpose_X'] is False and self.attrs[ + 'transpose_Y'] is True: + self.dx = self.matmul_grad(dout, False, self.y_fp32, False) + self.dy = self.matmul_grad(dout, True, self.x_fp32, False) + else: + self.dx = self.matmul_grad(dout, False, self.y_fp32, True) + self.dy = self.matmul_grad(self.x_fp32, True, dout, False) + + self.dout = dout class TestDnnlMatMulOpAlpha(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((17, 2, 3)).astype(np.float32) - self.y = np.random.random((17, 3, 2)).astype(np.float32) + self.x_fp32 = np.random.random((17, 2, 3)).astype(np.float32) + self.y_fp32 = np.random.random((17, 3, 2)).astype(np.float32) self.alpha = 2.0 - self.out = self.alpha * np.matmul(self.x, self.y) + self.out = self.alpha * np.matmul(self.x_fp32, self.y_fp32) class TestDnnlMatMulOp2D(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y 
= np.random.random((9, 12)).astype(np.float32) - self.out = np.matmul(self.x, self.y) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((9, 12)).astype(np.float32) + self.out = np.matmul(self.x_fp32, self.y_fp32) class TestDnnlMatMulOpTransposeX(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y = np.random.random((12, 9)).astype(np.float32) - self.out = np.matmul(np.transpose(self.x), self.y) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((12, 9)).astype(np.float32) + self.out = np.matmul(np.transpose(self.x_fp32), self.y_fp32) def set_attributes(self): self.attrs = { "use_mkldnn": self.use_mkldnn, "mkldnn_data_type": self.mkldnn_data_type, - 'transpose_X': True + 'transpose_X': True, + 'transpose_Y': False } class TestDnnlMatMulOpTransposeY(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y = np.random.random((12, 9)).astype(np.float32) - self.out = np.matmul(self.x, np.transpose(self.y)) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((12, 9)).astype(np.float32) + self.out = np.matmul(self.x_fp32, np.transpose(self.y_fp32)) def set_attributes(self): self.attrs = { "use_mkldnn": self.use_mkldnn, "mkldnn_data_type": self.mkldnn_data_type, - 'transpose_Y': True + 'transpose_Y': True, + 'transpose_X': False } class TestMatmulBf16MklDNNForceFp32Output(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y = np.random.random((9, 12)).astype(np.float32) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((9, 12)).astype(np.float32) self.force_fp32_output = True self.alpha = 0.5 - self.out = self.alpha * np.matmul(self.x, self.y) + self.out = self.alpha * np.matmul(self.x_fp32, self.y_fp32) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py index 2f557f0bf145eec64b88bc67c7192116bfb771c9..724b9d9818dc4510bff4db7bbb9ea9889df4ec93 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py @@ -19,7 +19,6 @@ import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci -@skip_check_grad_ci(reason="DNNL's MatMul doesn't implemend grad kernel.") class TestDnnlMatMulOp(OpTest): def generate_data(self): self.x = np.random.random((25, 2, 2)).astype("float32") @@ -48,21 +47,99 @@ class TestDnnlMatMulOp(OpTest): self.check_output() -class TestDnnlMatMulOpMixedDims1(TestDnnlMatMulOp): +class TestDnnlMatMulWithGradOp(TestDnnlMatMulOp): + def test_check_grad(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) + + +class TestDnnlMatMulOpMixedDims1(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((17, 2, 3)).astype("float32") self.y = np.random.random((3, 4)).astype("float32") self.out = np.matmul(self.x, self.y) -class TestDnnlMatMulOpMixedDims2(TestDnnlMatMulOp): +class TestDnnlMatMulOpMixedDimsYWiderTransposeY(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((8, 2, 3)).astype("float32") + self.y = np.random.random((4, 3)).astype("float32") + self.out = np.matmul(self.x, np.transpose(self.y)) + + def 
set_attributes(self): + self.attrs = {'transpose_Y': True} + + +class TestDnnlMatMulOpMixedDimsYWiderTransposeX(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((8, 3, 2)).astype("float32") + self.y = np.random.random((3, 4)).astype("float32") + self.out = np.matmul(np.transpose(self.x, (0, 2, 1)), self.y) + + def set_attributes(self): + self.attrs = {'transpose_X': True} + + +class TestDnnlMatMulOpMixedDimsXWiderTransposeXY(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((8, 3, 2)).astype("float32") + self.y = np.random.random((4, 3)).astype("float32") + self.out = np.matmul( + np.transpose(self.x, (0, 2, 1)), np.transpose(self.y)) + + def set_attributes(self): + self.attrs = {'transpose_X': True, 'transpose_Y': True} + + +class TestDnnlMatMulOpMixedDimsYWiderTransposeXY(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((3, 2)).astype("float32") + self.y = np.random.random((8, 4, 3)).astype("float32") + self.out = np.matmul( + np.transpose(self.x), np.transpose(self.y, (0, 2, 1))) + + def set_attributes(self): + self.attrs = {'transpose_X': True, 'transpose_Y': True} + + +class TestDnnlMatMulOpMixedDimsXWiderTransposeX(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((5, 4)).astype("float32") + self.y = np.random.random((8, 5, 4)).astype("float32") + self.out = np.matmul(np.transpose(self.x), self.y) + + def set_attributes(self): + self.attrs = {'transpose_X': True} + + +class TestDnnlMatMulOpVectorMultiply(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((5)).astype("float32") + self.y = np.random.random((5)).astype("float32") + self.out = np.matmul(self.x, self.y) + + +class TestDnnlMatMulOpVectorMultiplyTranspose(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((5)).astype("float32") + x_resized = np.copy(self.x) + x_resized = np.expand_dims(x_resized, 1) + self.y = np.random.random((6)).astype("float32") + y_resized = np.copy(self.y) + y_resized = np.expand_dims(y_resized, 0) + self.out = np.matmul(x_resized, y_resized) + + def set_attributes(self): + self.attrs = {'transpose_Y': True, 'transpose_X': True} + + +class TestDnnlMatMulOpMixedDims2(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((2, 3)).astype("float32") self.y = np.random.random((17, 3, 4)).astype("float32") self.out = np.matmul(self.x, self.y) -class TestDnnlMatMulOpAlpha(TestDnnlMatMulOp): +class TestDnnlMatMulOpAlpha(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((17, 2, 3)).astype("float32") self.y = np.random.random((17, 3, 2)).astype("float32") @@ -70,18 +147,14 @@ class TestDnnlMatMulOpAlpha(TestDnnlMatMulOp): self.out = self.alpha * np.matmul(self.x, self.y) -class TestDnnlMatMulOp2D(TestDnnlMatMulOp): - def print_tensor(self, name, tensor): - print(name) - print(tensor) - +class TestDnnlMatMulOp2D(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((12, 9)).astype("float32") self.y = np.random.random((9, 12)).astype("float32") self.out = np.matmul(self.x, self.y) -class TestDnnlMatMulOpTransposeX(TestDnnlMatMulOp): +class TestDnnlMatMulOpTransposeX(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((12, 9)).astype("float32") self.y = np.random.random((12, 9)).astype("float32") @@ -91,7 +164,7 @@ class TestDnnlMatMulOpTransposeX(TestDnnlMatMulOp): self.attrs = {'transpose_X': True} -class 
TestDnnlMatMulOpTransposeY(TestDnnlMatMulOp): +class TestDnnlMatMulOpTransposeY(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((12, 9)).astype("float32") self.y = np.random.random((12, 9)).astype("float32") @@ -101,7 +174,7 @@ class TestDnnlMatMulOpTransposeY(TestDnnlMatMulOp): self.attrs = {'transpose_Y': True} -class TestDnnlMatMulOpTransposeY3D(TestDnnlMatMulOp): +class TestDnnlMatMulOpTransposeY3D(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((17, 3, 2)).astype("float32") self.y = np.random.random((17, 3, 2)).astype("float32") @@ -480,4 +553,6 @@ class TestMatMulOpTransposeReshapeRankOfReshapeNotSupportedException( if __name__ == "__main__": + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8e9f989f06c10143f0992c3e559ac59da4611dad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
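+
+# Note (reader aid, not part of the original patch description): the BF16
+# scale-op tests below keep a float32 copy of each input, cast it to uint16
+# (bfloat16) via convert_float_to_uint16 for the op inputs, and supply
+# hand-computed float32 gradients through user_defined_grads /
+# user_defined_grad_outputs, since numeric gradient checking is done against
+# the float32 reference values.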
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +@unittest.skipIf(core.is_compiled_with_cuda(), + "core is compiled with CUDA which has no BF implementation") +class TestScaleOpBF16(OpTest): + def setUp(self): + self.op_type = "scale" + self.x_fp32 = np.random.random((10, 10)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale = -2.3 + self.inputs = {'X': self.x_bf16} + self.attrs = {'scale': self.scale, 'use_mkldnn': True, 'bias': 0.4} + self.use_mkldnn = True + self.outputs = { + 'Out': (self.x_fp32 * self.attrs['scale']) + self.attrs['bias'] + } + + def calculate_grads(self): + bias = 0 + if 'bias' in self.attrs: + bias = self.attrs['bias'] + + scale = self.scale + if 'ScaleTensor' in self.attrs: + scale = self.attrs['ScaleTensor'] + + self.out = (self.x_fp32 * scale) + bias + self.dx = (self.out * scale) + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + check_dygraph=False, + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.out)]) + + +class TestScaleOpBF16BiasNotAfterScale(TestScaleOpBF16): + def setUp(self): + self.op_type = "scale" + self.x_fp32 = np.random.random((10, 10)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale = 1.5 + self.inputs = {'X': self.x_bf16} + self.attrs = { + 'scale': self.scale, + 'use_mkldnn': True, + 'bias': 0.0, + 'bias_after_scale': False + } + self.use_mkldnn = True + self.outputs = { + 'Out': (self.x_fp32 + self.attrs['bias']) * self.attrs['scale'] + } + + +class TestScaleOpBF16ScaleTensor(TestScaleOpBF16): + def setUp(self): + self.op_type = "scale" + self.scale = -2.3 + self.x_fp32 = np.random.random((10, 10)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale_tensor = np.array([self.scale]).astype(np.float32) + self.inputs = { + 'X': self.x_bf16, + 'ScaleTensor': convert_float_to_uint16(self.scale_tensor) + } + self.attrs = {'use_mkldnn': True} + self.outputs = {'Out': self.x_fp32 * self.scale} + + +class TestScaleOpBF16ScaleTensorNotBiasAfterScale(TestScaleOpBF16): + def setUp(self): + self.op_type = "scale" + self.scale = 1.2 + self.x_fp32 = np.random.random((9, 13)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale_tensor = np.array([self.scale]).astype(np.float32) + self.inputs = { + 'X': self.x_bf16, + 'ScaleTensor': convert_float_to_uint16(self.scale_tensor) + } + self.attrs = { + 'bias': -1.1, + 'bias_after_scale': False, + 'use_mkldnn': True + } + self.outputs = {'Out': (self.x_fp32 + self.attrs['bias']) * self.scale} + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..528b55dcd873dcec7932f5292d18c5afa3c4678f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +import paddle +import paddle.fluid as fluid + + +class TestScaleOp(OpTest): + def setUp(self): + self.op_type = "scale" + self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)} + self.attrs = {'scale': -2.3, 'use_mkldnn': True, 'bias': 0.2} + self.use_mkldnn = True + self.outputs = { + 'Out': (self.inputs['X'] * self.attrs['scale']) + self.attrs['bias'] + } + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestScaleOpBiasNotAfterScale(OpTest): + def setUp(self): + self.op_type = "scale" + self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)} + self.attrs = { + 'scale': 1.5, + 'use_mkldnn': True, + 'bias': 2.3, + 'bias_after_scale': False + } + self.use_mkldnn = True + self.outputs = { + 'Out': (self.inputs['X'] + self.attrs['bias']) * self.attrs['scale'] + } + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestScaleOpScaleTensor(OpTest): + def setUp(self): + self.op_type = "scale" + self.scale = -2.3 + self.inputs = { + 'X': np.random.random((10, 10)).astype(np.float32), + 'ScaleTensor': np.array([self.scale]).astype(np.float32) + } + self.attrs = {} + self.outputs = {'Out': self.inputs['X'] * self.scale} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestScaleOpScaleTensorNotBiasAfterScale(OpTest): + def setUp(self): + self.op_type = "scale" + self.scale = -1.2 + self.inputs = { + 'X': np.random.random((10, 10)).astype(np.float32), + 'ScaleTensor': np.array([self.scale]).astype(np.float32) + } + self.attrs = {'bias': -6.8, 'bias_after_scale': False} + self.outputs = { + 'Out': + (self.inputs['X'] + self.attrs['bias']) * self.inputs['ScaleTensor'] + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py index 9e2229cece75c2074f288be29440f3027da64e5e..13c1883af6184f2971d43c6bbd88496912c5ec67 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py @@ -129,4 +129,6 @@ class TestSoftmaxMKLDNNPrimitivesAlreadyExist(unittest.TestCase): if __name__ == '__main__': + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/new_group.py b/python/paddle/fluid/tests/unittests/new_group.py index fb7beeee1df2e8b811ec770e1aa40852a4c8c7ef..c9c4acc3220c7b41a25f8dbf2fbe9266adcfc8ff 
100644 --- a/python/paddle/fluid/tests/unittests/new_group.py +++ b/python/paddle/fluid/tests/unittests/new_group.py @@ -27,6 +27,7 @@ class TestNewGroupAPI(object): def test_all(self): gp = paddle.distributed.new_group([0, 1]) + print("gp info:", gp) print("test new group api ok") tmp = np.array([0, 0, 0]) diff --git a/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py index b5175bdb19c7e5bc2e981b7f76fc2b7471d73d6f..5aeca5abd9f8315e2793e6cd5ba53a283c850b85 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py @@ -35,21 +35,21 @@ class TestAccuracy(OpTest): self.set_npu() self.init_dtype() np.random.seed(SEED) - pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype) - label = pred.copy() - accuracy = np.array([1]).astype(self.dtype) - correct = np.array([11 * 1]).astype(self.dtype) - total = np.array([11 * 1]).astype(self.dtype) - - self.inputs = { - "Out": OpTest.np_dtype_to_fluid_dtype(pred), - "Label": OpTest.np_dtype_to_fluid_dtype(label), - "Indices": OpTest.np_dtype_to_fluid_dtype(pred) - } + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break self.outputs = { - "Accuracy": accuracy, - "Correct": correct, - "Total": total + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") } def set_npu(self): @@ -69,53 +69,70 @@ class TestAccuracy2(TestAccuracy): self.set_npu() self.init_dtype() np.random.seed(SEED) - pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype) - label = np.random.uniform(4, 5, [11, 1]).astype(self.dtype) - accuracy = np.array([0]).astype(self.dtype) - correct = np.array([11 * 0]).astype(self.dtype) - total = np.array([11 * 1]).astype(self.dtype) - - self.inputs = { - "Out": OpTest.np_dtype_to_fluid_dtype(pred), - "Label": OpTest.np_dtype_to_fluid_dtype(label), - "Indices": OpTest.np_dtype_to_fluid_dtype(pred) - } + n = 8192 + infer = np.random.random((n, 100)).astype(self.dtype) + indices = np.random.randint(0, 1000, (n, 100)).astype('int64') + label = np.random.randint(0, 1000, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break self.outputs = { - "Accuracy": accuracy, - "Correct": correct, - "Total": total + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") } -class TestAccuracy3(TestAccuracy): +class TestAccuracyType(TestAccuracy): def setUp(self): self.op_type = "accuracy" self.set_npu() self.init_dtype() np.random.seed(SEED) - a = np.random.randint(1, 2, [5, 1]) - b = np.random.randint(0, 1, [5, 1]) - pred = np.row_stack((a, b)).astype(self.dtype) - label = np.random.randint(1, 2, [10, 1]).astype(self.dtype) - accuracy = np.array([0.5]).astype(self.dtype) - correct = np.array([5]).astype(self.dtype) - total = np.array([10 * 1]).astype(self.dtype) - - self.inputs = { - "Out": 
OpTest.np_dtype_to_fluid_dtype(pred), - "Label": OpTest.np_dtype_to_fluid_dtype(label), - "Indices": OpTest.np_dtype_to_fluid_dtype(pred) - } + n = 8192 + infer = np.random.random((n, 100)).astype(self.dtype) + indices = np.random.randint(0, 1000, (n, 100)).astype('int64') + label = np.random.randint(0, 1000, (n, 1)).astype('int32') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break self.outputs = { - "Accuracy": accuracy, - "Correct": correct, - "Total": total + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") } -class TestAccuracyInt(TestAccuracy): - def init_dtype(self): - self.dtype = np.int +class TestAccuracyType2(TestAccuracy): + def setUp(self): + self.op_type = "accuracy" + self.set_npu() + self.init_dtype() + np.random.seed(SEED) + n = 8192 + infer = np.random.random((n, 100)).astype(self.dtype) + indices = np.random.randint(0, 1000, (n, 100)).astype('int32') + label = np.random.randint(0, 1000, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py index ec616070b63abf8ca83f3ca561c14692ac39d89c..a3b4242f39d36631b546e775693cc2187c64b14b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -134,6 +134,65 @@ class TestAdamWithEpsilonTensor(OpTest): self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False) +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAdamOpWithGlobalBetaPow(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.attrs = {'use_global_beta_pow': True} + + # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty. 
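+        # With use_global_beta_pow the beta pow accumulators are expected to be
+        # kept as global accumulators on the optimizer side, so the NPU adam
+        # kernel should leave Beta1PowOut/Beta2PowOut empty.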
+ self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([]), + 'Beta2PowOut': np.array([]) + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False) + + @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") class TestNet(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 400ddd9d4aab0775af6007da36475db72561136f..2463ddb7137acd683fde3ce2c5d09341a5c4a4d2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -41,7 +41,7 @@ class TestLookupTableV2(OpTest): vocab = 10 dim = 20 w = np.ones([vocab, dim]).astype(self.dtype) - x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int64) + x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) out = np.ones([bsz, seqlen, dim]).astype(self.dtype) self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 25717b796771284c358bc690d80b7897d59edea4..654723d86299006b83d8f27d6090fb3e4c1082e2 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -132,6 +132,8 @@ def get_numeric_gradient(place, tensor_to_check_dtype = np.float16 # set delta as np.float16, will automatic convert to float32, float64 delta = np.array(delta).astype(np.float16) + elif tensor_to_check_dtype == core.VarDesc.VarType.BF16: + tensor_to_check_dtype = np.float32 else: raise ValueError("Not supported data type " + str( tensor_to_check_dtype)) @@ -140,9 +142,10 @@ def get_numeric_gradient(place, sum = [] op.run(scope, place) for output_name in output_names: - sum.append( - np.array(scope.find_var(output_name).get_tensor()).astype( - tensor_to_check_dtype).mean()) + output_numpy = np.array(scope.find_var(output_name).get_tensor()) + if tensor_to_check._dtype() == core.VarDesc.VarType.BF16: + output_numpy = convert_uint16_to_float(output_numpy) + sum.append(output_numpy.astype(tensor_to_check_dtype).mean()) return tensor_to_check_dtype(np.array(sum).sum() / len(output_names)) gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype) @@ -152,6 +155,11 @@ def get_numeric_gradient(place, numpy_tensor = np.array(tensor).astype(np.float16) numpy_tensor = numpy_tensor.flatten() return numpy_tensor[i] + elif tensor_to_check._dtype() == core.VarDesc.VarType.BF16: + numpy_tensor = np.array(tensor).astype(np.uint16) + numpy_tensor = numpy_tensor.flatten() + return struct.unpack(' 1e-10, abs_a <= 1e-8)] *= 1e4 abs_a[np.logical_and(abs_a > 1e-8, abs_a <= 1e-6)] *= 1e2 + elif self.is_bfloat16_op(): + abs_a[abs_a < 1e-2] = 1 else: abs_a[abs_a < 1e-3] = 1 @@ -1478,13 +1505,21 @@ class OpTest(unittest.TestCase): # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 - fp32_grads = [] + fp32_analytic_grads = [] for grad in analytic_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) max_relative_error = 0.03 - fp32_grads.append(grad) - analytic_grads = fp32_grads + fp32_analytic_grads.append(grad) + analytic_grads = fp32_analytic_grads + + fp32_numeric_grads = [] + for grad 
in numeric_grads: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = 0.03 + fp32_numeric_grads.append(grad) + numeric_grads = fp32_numeric_grads self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check, max_relative_error, @@ -1494,6 +1529,13 @@ class OpTest(unittest.TestCase): dygraph_grad = self._get_dygraph_grad( inputs_to_check, place, output_names, user_defined_grad_outputs, no_grad_set) + fp32_grads = [] + for grad in dygraph_grad: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = 0.03 + fp32_grads.append(grad) + dygraph_grad = fp32_grads self._assert_is_close(numeric_grads, dygraph_grad, inputs_to_check, max_relative_error, "Gradient Check On %s" % str(place)) @@ -1538,6 +1580,21 @@ class OpTest(unittest.TestCase): outputs=outputs, attrs=attrs_outputs if hasattr(self, "attrs") else None) + if self.dtype == np.uint16: + cast_inputs = self._find_var_in_dygraph(outputs, + output_names[0]) + cast_outputs = block.create_var( + dtype="float32", shape=cast_inputs[0].shape) + cast_op = block.append_op( + inputs={"X": cast_inputs}, + outputs={"Out": cast_outputs}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32 + }) + outputs = {output_names[0]: cast_outputs} + outputs_valid = {} for output_name in output_names: outputs_valid[output_name] = self._find_var_in_dygraph( @@ -1653,6 +1710,21 @@ class OpTest(unittest.TestCase): feed_dict = self.feed_var(inputs, place) if user_defined_grad_outputs is None: + if self.dtype == np.uint16: + cast_inputs = list(map(block.var, output_names)) + cast_outputs = block.create_var( + dtype="float32", shape=cast_inputs[0].shape) + cast_op = block.append_op( + inputs={"X": cast_inputs}, + outputs={"Out": cast_outputs}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32 + }) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names = [cast_outputs.name] loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, diff --git a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py index a62e3c05508a16ce91b60206fe6d275f30c0d7b0..a24c08744821132f60cf342f118ec8c344108729 100644 --- a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py @@ -65,12 +65,12 @@ class TestRowParallelLinearAPI(TestCollectiveAPIRunnerBase): linear_out = paddle.distributed.split( data, - size=(1000, 8), + size=(1000, 16), operation='linear', axis=0, num_partitions=2, weight_attr=param_attr, - bias_attr=False, ) + bias_attr=True, ) return [linear_out] diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 31589ca4ae38e83307411ecbc39e8ea815987f03..ef5ac46cede42c57f1581baf097826d534e4ae6a 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -18,7 +18,7 @@ import unittest import numpy as np from scipy.special import expit, erf -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -1103,12 +1103,19 @@ class TestRelu(TestActivation): self.init_dtype() np.random.seed(1024) - x = np.random.uniform(-1, 1, [11, 
17]).astype(self.dtype) - # The same reason with TestAbs - x[np.abs(x) < 0.005] = 0.02 - out = np.maximum(x, 0) + if self.dtype == np.uint16: + x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = convert_float_to_uint16(np.maximum(x, 0)) + self.inputs = {'X': convert_float_to_uint16(x)} + else: + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + self.inputs = {'X': x} - self.inputs = {'X': x} self.outputs = {'Out': out} def test_check_grad(self): @@ -2739,5 +2746,32 @@ create_test_act_fp16_class(TestHardSigmoid) create_test_act_fp16_class(TestSwish, grad_atol=0.85) create_test_act_fp16_class(TestHardSwish) + +def create_test_act_bf16_class(parent, + atol=1e-2, + grad_check=True, + grad_atol=0.80): + @unittest.skipIf(not paddle.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestActBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=grad_atol) + + cls_name = "{0}_{1}".format(parent.__name__, "bf16") + TestActBF16.__name__ = cls_name + globals()[cls_name] = TestActBF16 + + +create_test_act_bf16_class(TestRelu) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index cb646ef0b9321373327e50c54826ebda8fd45fb0..1e316c3383ea76f968868fbc7f90ccc898bc61a8 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -404,7 +404,7 @@ class TestAdamOpBetaVariable(OpTest): class TestAdamOpBetaEpsilonVariable(OpTest): def setUp(self): - '''Test Adam Op with beta as Variable + '''Test Adam Op with beta/epsilon as Variable ''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -450,6 +450,57 @@ class TestAdamOpBetaEpsilonVariable(OpTest): self.check_output() +class TestAdamOpWithGlobalBetaPow(OpTest): + def setUp(self): + '''Test Adam Op with global_beta_pow + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + beta1 = 0.85 + beta2 = 0.95 + + learning_rate = 0.001 + epsilon = 1e-8 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + "Beta1Tensor": np.array([beta1]).astype("float32"), + "Beta2Tensor": np.array([beta2]).astype("float32"), + "EpsilonTensor": np.array([epsilon]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.attrs = {'use_global_beta_pow': True} + + # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty. 
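+        # adam_step only returns the param/moment updates; with
+        # use_global_beta_pow the beta pow accumulators are handled globally by
+        # the optimizer, so empty arrays are the expected op outputs here.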
+ self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([]), + 'Beta2PowOut': np.array([]) + } + + def test_check_output(self): + self.check_output() + + class TestAdamOpV2(unittest.TestCase): def test_adam_op(self): place = fluid.CPUPlace() @@ -493,6 +544,7 @@ class TestAdamOpV2(unittest.TestCase): out.backward() adam.step() adam.clear_gradients() + paddle.enable_static() def test_adam_op_with_state_dict(self): @@ -523,6 +575,7 @@ class TestAdamOpV2(unittest.TestCase): params = adam.get_opti_var_name_list() assert (params is not None) + paddle.enable_static() def test_adam_with_grad_clip(self): paddle.disable_static() @@ -536,6 +589,7 @@ class TestAdamOpV2(unittest.TestCase): out.backward() adam.step() adam.clear_gradients() + paddle.enable_static() def test_adam_op_with_set_lr(self): paddle.disable_static() @@ -550,6 +604,7 @@ class TestAdamOpV2(unittest.TestCase): lr_var = paddle.fluid.layers.create_global_var( shape=[1], value=lr, dtype='float32') adam.set_lr(lr_var) + paddle.enable_static() def test_adam_op_invalid_input(self): paddle.disable_static() @@ -563,6 +618,7 @@ class TestAdamOpV2(unittest.TestCase): with self.assertRaises(ValueError): adam = paddle.optimizer.Adam( 0.1, epsilon=-1, parameters=linear.parameters()) + paddle.enable_static() def test_adam_op_with_sparse_input_and_weight_decay(self): @@ -577,10 +633,15 @@ class TestAdamOpV2(unittest.TestCase): out = emb(x) out.backward() adam.step() + paddle.enable_static() class TestNetWithEpsilonTensor(unittest.TestCase): - def _test(self, place, use_tensor=True, use_fluid_api=True): + def _test(self, + place, + use_tensor=True, + use_fluid_api=True, + use_global_beta_pow=False): paddle.enable_static() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() @@ -633,7 +694,8 @@ class TestNetWithEpsilonTensor(unittest.TestCase): learning_rate=0.01, beta1=beta1, beta2=beta2, - epsilon=epsilon) + epsilon=epsilon, + use_global_beta_pow=use_global_beta_pow) else: adam = paddle.optimizer.Adam( learning_rate=0.01, @@ -646,7 +708,9 @@ class TestNetWithEpsilonTensor(unittest.TestCase): learning_rate=0.01, beta1=beta1_init, beta2=beta2_init, - epsilon=epsilon_init) + epsilon=epsilon_init, + use_global_beta_pow=use_global_beta_pow, + name='a') else: adam = fluid.optimizer.Adam( learning_rate=0.01, @@ -680,9 +744,11 @@ class TestNetWithEpsilonTensor(unittest.TestCase): for use_tensor in [True, False]: for use_fluid_api in [True, False]: - pred, loss = self._test(place, use_tensor, use_fluid_api) - preds.append(pred) - losses.append(loss) + for use_global_beta_pow in [True, False]: + pred, loss = self._test(place, use_tensor, use_fluid_api, + use_global_beta_pow) + preds.append(pred) + losses.append(loss) for pred in preds: self.assertTrue(np.allclose(pred, preds[0])) for loss in losses: @@ -694,6 +760,55 @@ class TestNetWithEpsilonTensor(unittest.TestCase): if core.is_compiled_with_cuda(): self._test_with_place(paddle.CUDAPlace(0)) + def test_adam_exception(self): + paddle.enable_static() + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data(name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = 
fluid.layers.reduce_mean(cost) + adam = fluid.optimizer.Adam(use_global_beta_pow=True) + adam.minimize(loss) + self.assertRaises(Exception, adam._get_global_accumulator, 'tmp') + adam._add_global_accumulator( + 'tmp', type=core.VarDesc.VarType.LOD_TENSOR) + adam._get_global_accumulator('tmp') + self.assertRaises( + Exception, + adam._add_global_accumulator, + adam._beta1_pow_acc_str, + type=core.VarDesc.VarType.LOD_TENSOR) + paddle.disable_static() + + def test_adam_save_load(self): + paddle.disable_static() + a = paddle.rand([4, 10]) + linear = paddle.nn.Linear(10, 10) + b = linear(a) + state_dict = linear.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.fluid.optimizer.Adam( + learning_rate=scheduler, + parameter_list=linear.parameters(), + use_global_beta_pow=True) + adam.minimize(b) + state_dict = adam.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") + adam.set_state_dict(opti_state_dict) + + paddle.enable_static() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_asp_utils.py b/python/paddle/fluid/tests/unittests/test_asp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..faffd477ae5661cee5e599b2044af4f42a96112f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_asp_utils.py @@ -0,0 +1,189 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
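
The new ASP test below exercises Paddle's n:m structured-sparsity helpers (check_mask_1d, get_mask_1d, the 2-D variants, create_mask and check_sparsity). As the assertions below use it, an n:m-sparse row keeps at least n zeros in every group of m consecutive elements, with short rows zero-padded up to a multiple of m; a rough NumPy sketch of that 1-D check, for orientation only and not Paddle's implementation:

    import numpy as np

    def check_mask_1d_sketch(mat, n, m):
        # every m consecutive elements of each row must contain at least n zeros;
        # rows are zero-padded on the right up to a multiple of m
        mat = np.atleast_2d(np.asarray(mat))
        pad = (-mat.shape[1]) % m
        mat = np.pad(mat, ((0, 0), (0, pad)), mode='constant')
        for row in mat:
            for i in range(0, row.size, m):
                if np.count_nonzero(row[i:i + m] == 0) < n:
                    return False
        return True
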
+ +from __future__ import print_function + +import unittest +import threading, time +import paddle +from paddle.fluid.contrib import sparsity +import numpy as np + + +class TestASPUtils(unittest.TestCase): + def test_get_check_method(self): + self.assertEqual( + sparsity.CheckMethod.get_checking_method(sparsity.MaskAlgo.MASK_1D), + sparsity.CheckMethod.CHECK_1D) + self.assertEqual( + sparsity.CheckMethod.get_checking_method( + sparsity.MaskAlgo.MASK_2D_GREEDY), + sparsity.CheckMethod.CHECK_2D) + self.assertEqual( + sparsity.CheckMethod.get_checking_method( + sparsity.MaskAlgo.MASK_2D_BEST), sparsity.CheckMethod.CHECK_2D) + + def test_density(self): + x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], + [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 0.0, 1.0]]) + self.assertEqual(sparsity.density(x), 0.56) + x[:, 0] = 0.0 + self.assertEqual(sparsity.density(x), 0.4) + + def test_check_mask_1d(self): + x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], + [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 0.0, 1.0]]) + self.assertTrue(sparsity.check_mask_1d(x, 2, 4)) + self.assertFalse(sparsity.check_mask_1d(x, 3, 4)) + self.assertTrue(sparsity.check_mask_1d(x, 2, 5)) + self.assertFalse(sparsity.check_mask_1d(x, 3, 5)) + self.assertTrue(sparsity.check_mask_1d(x, 3, 6)) + self.assertFalse(sparsity.check_mask_1d(x, 4, 6)) + + def test_get_mask_1d(self): + for _ in range(10): + x = np.random.randint(10, size=(5, 5)) + x = sparsity.get_mask_1d(x, 2, 4) + self.assertTrue(sparsity.check_mask_1d(x, 2, 4)) + + x = np.random.randn(5, 4) + x = sparsity.get_mask_1d(x, 2, 4) + self.assertTrue(sparsity.check_mask_1d(x, 2, 4)) + + def test_check_mask_2d(self): + x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 1.0]]) + self.assertTrue(sparsity.check_mask_2d(x, 2, 4)) + self.assertFalse(sparsity.check_mask_2d(x, 3, 4)) + self.assertTrue(sparsity.check_mask_2d(x, 2, 5)) + self.assertFalse(sparsity.check_mask_2d(x, 3, 5)) + self.assertTrue(sparsity.check_mask_2d(x, 3, 6)) + self.assertFalse(sparsity.check_mask_2d(x, 4, 6)) + + def test_get_mask_2d_greedy(self): + for _ in range(10): + x = np.random.randint(10, size=(5, 5)) + x = sparsity.get_mask_2d_greedy(x, 2, 4) + self.assertTrue(sparsity.check_mask_2d(x, 2, 4)) + + x = np.random.randn(5, 4) + x = sparsity.get_mask_2d_greedy(x, 2, 4) + self.assertTrue(sparsity.check_mask_2d(x, 2, 4)) + + def test_get_mask_2d_best(self): + for _ in range(10): + x = np.random.randint(10, size=(5, 5)) + x = sparsity.get_mask_2d_best(x, 2, 4) + self.assertTrue(sparsity.check_mask_2d(x, 2, 4)) + + x = np.random.randn(5, 4) + x = sparsity.get_mask_2d_best(x, 2, 4) + self.assertTrue(sparsity.check_mask_2d(x, 2, 4)) + + def test_threadsafe_valid_2d_patterns(self): + def get_reference(m=4, n=2): + from itertools import permutations + + patterns = np.zeros(m) + patterns[:n] = 1 + patterns = list(set(permutations(patterns.tolist()))) + patterns = patterns + patterns + patterns = np.asarray(list(set(permutations(patterns, m)))) + + valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m + ).nonzero()[0].reshape(-1) + valid_patterns = np.empty((valid.shape[0], m, m)) + valid_patterns[:] = patterns[valid[:]] + return valid_patterns + + for _ in range(4): + computing_thread = threading.Thread( + target=paddle.fluid.contrib.sparsity.utils. 
+ compute_valid_2d_patterns, + args=(2, 4)) + computing_thread.start() + time.sleep(3) + patterns_map = paddle.fluid.contrib.sparsity.utils.valid_2d_patterns + reference_patterns = get_reference() + reference_key = '4_2' + + self.assertTrue(reference_key in patterns_map) + self.assertTrue(len(patterns_map) == 1) + self.assertTrue((reference_patterns == patterns_map[reference_key]).all( + )) + + def test_check_sparsity(self): + for _ in range(10): + x = np.random.randint(10, size=(5)) + x_2d = x.reshape(1, x.shape[0]) + self.__test_1D_2D_sparsity_checking_methods(x_2d) + + x = np.random.randint(10, size=(5, 5)) + x_2d = x + self.__test_1D_2D_sparsity_checking_methods(x_2d) + + x = np.random.randint(10, size=(5, 5, 5)) + x_2d = x.reshape(x.shape[0] * x.shape[1], x.shape[2]) + self.__test_1D_2D_sparsity_checking_methods(x_2d) + + x = np.random.randint(10, size=(5, 5, 5, 5)) + x_2d = x.reshape(x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]) + self.__test_1D_2D_sparsity_checking_methods(x_2d) + + def test_create_mask(self): + for _ in range(10): + x = np.random.randint(10, size=(5)) + self.__test_1D_2D_sparse_mask_generation_methods(x) + + x = np.random.randint(10, size=(5, 5)) + self.__test_1D_2D_sparse_mask_generation_methods(x) + + x = np.random.randint(10, size=(5, 5, 5)) + self.__test_1D_2D_sparse_mask_generation_methods(x) + + x = np.random.randint(10, size=(5, 5, 5, 5)) + self.__test_1D_2D_sparse_mask_generation_methods(x) + + def __test_1D_2D_sparsity_checking_methods(self, x_2d): + mask = sparsity.get_mask_1d(x_2d, 2, 4) + self.assertEqual( + sparsity.check_sparsity( + mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4), + sparsity.check_mask_1d(mask, 2, 4)) + mask = sparsity.get_mask_2d_best(x_2d, 2, 4) + self.assertEqual( + sparsity.check_sparsity( + mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4), + sparsity.check_mask_2d(mask, 2, 4)) + + def __test_1D_2D_sparse_mask_generation_methods(self, x): + mask = sparsity.create_mask( + x, func_name=sparsity.MaskAlgo.MASK_1D, n=2, m=4) + self.assertTrue( + sparsity.check_sparsity( + mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4)) + mask = sparsity.create_mask( + x, func_name=sparsity.MaskAlgo.MASK_2D_GREEDY, n=2, m=4) + self.assertTrue( + sparsity.check_sparsity( + mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4)) + mask = sparsity.create_mask( + x, func_name=sparsity.MaskAlgo.MASK_2D_BEST, n=2, m=4) + self.assertTrue( + sparsity.check_sparsity( + mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4)) diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index e6693b676cf6430b329d83dc5ff8c99bc1f131e9..f0c042eb7e95b69e7fa894df6c06e5a6fb649588 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -154,7 +154,10 @@ class TestDistBase(unittest.TestCase): #update environment env0.update(envs) env1.update(envs) - tr_cmd = "%s %s" + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + tr_cmd = "%s -m coverage run --branch -p %s" + else: + tr_cmd = "%s %s" tr0_cmd = tr_cmd % (self._python_interp, model_file) tr1_cmd = tr_cmd % (self._python_interp, model_file) tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 8dc80c893126925ff0643b4cde622fe504c6b1d9..a2dd7e49ac4ccdd6135a27d7b88f6fbdec2132b9 100644 
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -139,6 +139,22 @@ def create_paddle_case(op_type, callback): fetch_list=[out]) self.assertEqual((res == real_result).all(), True) + def test_broadcast_api_3(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[5], dtype='int32') + y = paddle.static.data(name='y', shape=[3, 1], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 5).reshape((5)).astype(np.int32) + input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + def test_attr_name(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_complex_cast.py b/python/paddle/fluid/tests/unittests/test_complex_cast.py new file mode 100644 index 0000000000000000000000000000000000000000..b4162be5b3691d59451a97778f9cf0d55aa4d532 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_complex_cast.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
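
The new complex-cast test below checks that casting a complex Tensor to a real dtype keeps only the real part, and that complex64 and complex128 casts round-trip. A small dygraph sketch of the behaviour being asserted (values chosen arbitrarily):

    import numpy as np
    import paddle

    c = paddle.to_tensor(np.array([1.5 + 2.0j, 3.0 - 1.0j]), dtype='complex64')
    print(c.cast('float32').numpy())    # real parts only: 1.5, 3.0
    print(c.cast('int64').numpy())      # real parts truncated: 1, 3
    print(c.cast('complex128').dtype)   # paddle.complex128
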
+ +from __future__ import print_function, division + +import unittest +import numpy as np + +import paddle + + +class TestComplexCastOp(unittest.TestCase): + def test_complex_to_real(self): + r = np.random.random(size=[10, 10]) * 10 + i = np.random.random(size=[10, 10]) + + c_t = paddle.to_tensor(r + i * 1J, dtype='complex64') + + self.assertEqual(c_t.cast('int64').dtype, paddle.int64) + self.assertEqual(c_t.cast('int32').dtype, paddle.int32) + self.assertEqual(c_t.cast('float32').dtype, paddle.float32) + self.assertEqual(c_t.cast('float64').dtype, paddle.float64) + self.assertEqual(c_t.cast('bool').dtype, paddle.bool) + + self.assertTrue( + np.allclose(c_t.cast('int64').numpy(), r.astype('int64'))) + self.assertTrue( + np.allclose(c_t.cast('int32').numpy(), r.astype('int32'))) + self.assertTrue( + np.allclose(c_t.cast('float32').numpy(), r.astype('float32'))) + self.assertTrue( + np.allclose(c_t.cast('float64').numpy(), r.astype('float64'))) + self.assertTrue(np.allclose(c_t.cast('bool').numpy(), r.astype('bool'))) + + def test_real_to_complex(self): + r = np.random.random(size=[10, 10]) * 10 + r_t = paddle.to_tensor(r) + + self.assertEqual(r_t.cast('complex64').dtype, paddle.complex64) + self.assertEqual(r_t.cast('complex128').dtype, paddle.complex128) + + self.assertTrue(np.allclose(r_t.cast('complex64').real().numpy(), r)) + self.assertTrue(np.allclose(r_t.cast('complex128').real().numpy(), r)) + + def test_complex64_complex128(self): + r = np.random.random(size=[10, 10]) + i = np.random.random(size=[10, 10]) + + c = r + i * 1J + c_64 = paddle.to_tensor(c, dtype='complex64') + c_128 = paddle.to_tensor(c, dtype='complex128') + + self.assertTrue(c_64.cast('complex128').dtype, paddle.complex128) + self.assertTrue(c_128.cast('complex128').dtype, paddle.complex64) + self.assertTrue( + np.allclose(c_64.cast('complex128').numpy(), c_128.numpy())) + self.assertTrue( + np.allclose(c_128.cast('complex128').numpy(), c_64.numpy())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py index 9bc48ac0a1b2d4eca90acc1cd9792696bfcb7a2e..eae19afb2ef86c10e4ba0997efbbfcf4933f45e2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py @@ -25,12 +25,15 @@ flag_name = os.path.splitext(__file__)[0] def count_of_sparse_all_reduce_calls(file_name): - cmd = 'grep sparse_all_reduce_op_handle ' + file_name + ' | grep in_numel | wc -l' + # NOTE(Aurelius84): The log file contains some binary contents that causes error + # while `grep`. So we add `-a` to fix it. + # -a, --text equivalent to --binary-files=text, make binaries equivalent to text. + cmd = 'grep -a sparse_all_reduce_op_handle ' + file_name + ' | grep in_numel | wc -l' child = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) result = child.communicate()[0] print('test_info: result = ' + str(result)) - # note. in python3, result is b'num', != 'num' + # NOTE: in python3, result is b'num', != 'num' return int(result) @@ -59,7 +62,7 @@ class TestDistMnistNCCL2DGC(TestDistBase): # only 1 layer use dgc now, run_step=5, rampup_begin_step=2, so 1 * (5 - 2) = 3 # temp close this test. In python3 CI, the log is right, but the result - # has a problem, may be in multi process mode, log is not writed in time. + # has a problem, may be in multi process mode, log is not written in time. 
# self.assertEqual(result, 3) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index 05da44cd061331ff9a8e15d3095bec3bdf6965fb..628f1db80d2d460f9220f6e3b63d4a54b4ba55b4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -80,15 +80,17 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name]) print("cost of step[{}] = {}".format(i, cost_val)) - proc_a = launch_func(node_func, node_a) - proc_a.start() + # rank 1 + proc_b = launch_func(node_func, node_b) + proc_b.start() + # rank 0, for wait server ready coverage # just for coverage - for key in node_b: - os.environ[key] = node_b[key] + for key in node_a: + os.environ[key] = node_a[key] node_func() - proc_a.join() + proc_b.join() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_launch_coverage.py b/python/paddle/fluid/tests/unittests/test_launch_coverage.py index 43613928585e77035d7d405996b3b4940d953d08..9fbf27e3c1d063452d2bc75805c45fbd2a0959d2 100644 --- a/python/paddle/fluid/tests/unittests/test_launch_coverage.py +++ b/python/paddle/fluid/tests/unittests/test_launch_coverage.py @@ -24,6 +24,7 @@ import paddle.fluid as fluid from argparse import ArgumentParser, REMAINDER from paddle.distributed.utils import _print_arguments, get_gpus, get_cluster_from_args +from paddle.distributed.fleet.launch_utils import find_free_ports def _parse_args(): @@ -115,6 +116,9 @@ class TestCoverage(unittest.TestCase): args.use_paddlecloud = True cluster, pod = get_cluster_from_args(args, "0") + def test_find_free_ports(self): + find_free_ports(2) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index c48ec2a4fb458cc3028f8c6a92eb39553e215450..31c68b88b86a77b8d1da668602fd9fd5c1e5e075 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -50,15 +50,30 @@ class TestLogsumexp(OpTest): 'keepdim': self.keepdim, 'reduce_all': self.reduce_all } + self.user_defined_grads = None + self.user_defined_grad_outputs = None + self.set_attrs_addition() def set_attrs(self): pass + def set_attrs_addition(self): + pass + def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], ['Out']) + self.check_grad( + ['X'], ['Out'], + user_defined_grads=self.user_defined_grads, + user_defined_grad_outputs=self.user_defined_grad_outputs) + + def calc_grad(self): + dy = np.ones(1, dtype=self.dtype) + x = self.inputs['X'] + y = self.outputs['Out'] + return dy * np.exp(x - y) class TestLogsumexp_shape(TestLogsumexp): @@ -75,6 +90,11 @@ class TestLogsumexp_axis_all(TestLogsumexp): def set_attrs(self): self.axis = [0, 1, 2, 3] + def set_attrs_addition(self): + if paddle.fluid.core.is_compiled_with_rocm(): + self.user_defined_grads = [self.calc_grad()] + self.user_defined_grad_outputs = [np.ones(1, dtype=self.dtype)] + class TestLogsumexp_keepdim(TestLogsumexp): def set_attrs(self): @@ -85,6 +105,11 @@ class TestLogsumexp_reduce_all(TestLogsumexp): def set_attrs(self): self.reduce_all = True + def set_attrs_addition(self): + if paddle.fluid.core.is_compiled_with_rocm(): + self.user_defined_grads = [self.calc_grad()] + self.user_defined_grad_outputs = [np.ones(1, 
dtype=self.dtype)] + class TestLogsumexpError(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 3a5c43b2bab3ed75dda7c2f0e8daabcb73cc786b..be2a6a653cc6f40e6e606200b307c7755a8f9559 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -412,11 +412,10 @@ class TestSaveLoadAny(unittest.TestCase): ] obj2 = {'k1': obj1, 'k2': state_dict, 'epoch': 123} obj3 = (paddle.randn( - [5, 4], dtype='float32'), np.ndarray( - [3, 4], dtype="float32"), { - "state_dict": state_dict, - "opt": state_dict - }) + [5, 4], dtype='float32'), np.random.randn(3, 4).astype("float32"), { + "state_dict": state_dict, + "opt": state_dict + }) obj4 = (np.random.randn(5, 6), (123, )) path1 = "test_save_load_any_complex_object_dygraph/obj1" diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py index 8b508d5c9ae79801fa6c0c865650205ca6932b4f..7385da56beab3eb9517e10f7cd6e37741daffbdf 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py @@ -19,6 +19,7 @@ import numpy as np import os import sys import six +import platform import paddle import paddle.nn as nn @@ -162,12 +163,13 @@ class TestSaveLoadBinaryFormat(unittest.TestCase): with self.assertRaises(NotImplementedError): path = 'test_save_load_error/temp' paddle.save({}, path, use_binary_format=True) - - with self.assertRaises(ValueError): - path = 'test_save_load_error/temp' - with open(path, "w") as f: - f.write('\0') - paddle.load(path) + # On the Windows platform, when parsing a string that can't be parsed as a `Program`, `desc_.ParseFromString` has a timeout risk. + if 'Windows' != platform.system(): + with self.assertRaises(ValueError): + path = 'test_save_load_error/temp' + with open(path, "w") as f: + f.write('\0') + paddle.load(path) with self.assertRaises(ValueError): temp_lod = fluid.core.LoDTensor() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 5491b451368c825c10f1e957d85e30ccacdd1dc7..f3cd97ee1ec86916ecebb8ddf0895443c6e14567 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -17,8 +17,11 @@ from __future__ import print_function import unittest import time import paddle.fluid as fluid +import copy +import os +import subprocess -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, start_local_trainers +from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc def get_cluster_from_args(selected_gpus): @@ -46,6 +49,55 @@ def get_gpus(selected_gpus): return selected_gpus +def start_local_trainers(cluster, + pod, + training_script, + training_script_args, + log_dir=None): + current_env = copy.copy(os.environ.copy()) + #paddle broadcast ncclUniqueId use socket, and + #proxy maybe make trainers unreachable, so delete them. + #if we set them to "", grpc will log error message "bad uri" + #so just delete them. 
+ current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + procs = [] + for t in pod.trainers: + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), + "PADDLE_TRAINER_ID": "%d" % t.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + + current_env.update(proc_env) + + print("trainer proc env:{}".format(current_env)) + + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + cmd = "python -m coverage run --branch -p " + training_script + else: + cmd = "python -u " + training_script + + print("start trainer proc:{} env:{}".format(cmd, proc_env)) + + fn = None + + proc = subprocess.Popen(cmd.split(" "), env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = t.rank + tp.log_fn = fn + tp.cmd = cmd + + procs.append(tp) + + return procs + + class TestMultipleGpus(unittest.TestCase): def run_mnist_2gpu(self, target_file_name): if not fluid.core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py similarity index 89% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py rename to python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index f3b89d694f70b96df70f4923b5af3433c7e2e26c..1d06e168208b279ccfba753452b1a82e5034975f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -24,6 +24,9 @@ class TestHybridPipeParallel(TestMultipleGpus): def test_hybrid_parallel_pp_layer(self): self.run_mnist_2gpu('hybrid_parallel_pp_layer.py') + def test_hybrid_parallel_pp_tuple_inputs(self): + self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py rename to python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py diff --git a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py index 7f8294ad0efe7536a27024fd30dbcdda15220efd..f62e160673f8d22ee895fe357d25c665859130c1 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py @@ -22,7 +22,7 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus class TestPipelineParallel(TestMultipleGpus): def test_pipeline_parallel(self): - self.run_mnist_2gpu('hybrid_parallel_pp_model.py') + self.run_mnist_2gpu('hybrid_parallel_pp_alexnet.py') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 83f02b629d7acdf8de27ecc506e62701bf890d17..b3671327ca2959e9331bb4875fabee19f72010ae 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -248,6 +248,21 @@ class TestVarBase(unittest.TestCase): a = paddle.to_tensor(a, place=paddle.CUDAPinnedPlace()) self.assertEqual(a.place.__repr__(), "CUDAPinnedPlace") + def 
test_to_tensor_with_lodtensor(self): + if core.is_compiled_with_cuda(): + a_np = np.random.rand(1024, 1024) + with paddle.fluid.dygraph.guard(core.CPUPlace()): + lod_tensor = core.LoDTensor() + lod_tensor.set(a_np, core.CPUPlace()) + a = paddle.to_tensor(lod_tensor) + self.assertTrue(np.array_equal(a_np, a.numpy())) + + with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): + lod_tensor = core.LoDTensor() + lod_tensor.set(a_np, core.CUDAPlace(0)) + a = paddle.to_tensor(lod_tensor) + self.assertTrue(np.array_equal(a_np, a.numpy())) + def test_to_variable(self): with fluid.dygraph.guard(): var = fluid.dygraph.to_variable(self.array, name="abc") diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 690ac46e563ef04a7ea416aa9615441245f6a56b..71051689dbc1572600e34d920e1b1f43ca2e9524 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -15,12 +15,15 @@ from __future__ import print_function import unittest +import paddle from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core import numpy as np +paddle.enable_static() + class TestVariable(unittest.TestCase): def test_np_dtype_convert(self): diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index 844115d4acecc40b92e2f14dbc8f2a4f3df920dd..24c463ebfc9a1336c6a7eea19d4190db17d6f08c 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -36,7 +36,8 @@ def YoloBox(x, img_size, attrs): clip_bbox = attrs['clip_bbox'] scale_x_y = attrs['scale_x_y'] bias_x_y = -0.5 * (scale_x_y - 1.) - input_size = downsample * h + input_h = downsample * h + input_w = downsample * w x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) @@ -50,7 +51,7 @@ def YoloBox(x, img_size, attrs): anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] anchors_s = np.array( - [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors]) + [(an_w / input_w, an_h / input_h) for an_w, an_h in anchors]) anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1)) anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1)) pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w @@ -191,5 +192,19 @@ class TestYoloBoxStatic(unittest.TestCase): assert boxes is not None and scores is not None +class TestYoloBoxOpHW(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int(len(self.anchors) // 2) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = False + self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 9) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1. + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py index 9bea33e484e19c0d9a9a2b94af4b545c7f7fb4b7..d33cb2157b03bead1f870b9a2536f5e47d28ee36 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py @@ -13,13 +13,18 @@ # limitations under the License. 
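
The test_to_tensor_with_lodtensor case above exercises a new path in paddle.to_tensor that accepts a core.LoDTensor directly (the corresponding change is in python/paddle/tensor/creation.py further down). A minimal dygraph sketch of that path:

    import numpy as np
    import paddle
    import paddle.fluid.core as core

    a_np = np.random.rand(4, 4)
    lod_tensor = core.LoDTensor()
    lod_tensor.set(a_np, core.CPUPlace())
    t = paddle.to_tensor(lod_tensor)   # accepted like a numpy array or Tensor
    assert np.array_equal(t.numpy(), a_np)
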
from __future__ import print_function +import unittest import sys sys.path.append("..") -import unittest + import numpy as np -from op_test import OpTest + import paddle import paddle.fluid as fluid +from op_test import OpTest +from op_test_xpu import XPUOpTest + +paddle.enable_static() def gather_numpy(x, index, axis): @@ -29,37 +34,12 @@ def gather_numpy(x, index, axis): return gather -class TestGatherOp(OpTest): - def setUp(self): - self.op_type = "gather" - self.config() - xnp = np.random.random(self.x_shape).astype(self.x_type) - self.inputs = { - 'X': xnp, - 'Index': np.array(self.index).astype(self.index_type) - } - self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - def config(self): - """ - For multi-dimension input - """ - self.x_shape = (10, 20) - self.x_type = "float64" - self.index = [1, 3, 5] - self.index_type = "int32" - - -class TestXPUGatherOp(OpTest): +class TestXPUGatherOp(XPUOpTest): def setUp(self): + self.dtype = "float32" self.op_type = "gather" - self.dtype = np.float32 + self.use_xpu = True + self.use_mkldnn = False self.attrs = {'use_xpu': True} self.config() @@ -71,12 +51,12 @@ class TestXPUGatherOp(OpTest): self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} def test_check_output(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) self.check_output_with_place(place) def test_check_grad(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X'], 'Out') @@ -85,7 +65,7 @@ class TestXPUGatherOp(OpTest): For multi-dimension input """ self.x_shape = (10, 20) - self.x_type = self.dtype + self.x_type = "float32" self.index = [1, 3, 5] self.index_type = "int32" @@ -150,5 +130,14 @@ class TestCase6(TestXPUGatherOp): self.index_type = "int32" +class TestCase7(TestXPUGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {'use_xpu': True, 'overwrite': True} + self.x_type = "float32" + self.index = [1, 3] + self.index_type = "int64" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py new file mode 100644 index 0000000000000000000000000000000000000000..c4e1363bd9c94df188d64375f4617691638c846d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
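
The XPU logsumexp test that follows reuses the same NumPy reference as test_logsumexp.py above (out = log(sum(exp(x)))). The user-defined gradient added for the ROCm path, dy * exp(x - out), is the softmax of x, which is the analytic derivative of logsumexp; a tiny standalone check of that identity:

    import numpy as np

    x = np.array([0.1, -0.3, 0.7])
    y = np.log(np.exp(x).sum())     # logsumexp over all elements
    analytic = np.exp(x - y)        # d y / d x, i.e. softmax(x)

    eps = 1e-6
    x_eps = x.copy()
    x_eps[0] += eps
    numeric = (np.log(np.exp(x_eps).sum()) - y) / eps
    assert np.isclose(analytic[0], numeric, atol=1e-5)
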
+ +import paddle +import unittest +import sys +sys.path.append("..") +import numpy as np +from op_test import OpTest +from op_test_xpu import XPUOpTest + +paddle.enable_static() + + +def ref_logsumexp(x, axis=None, keepdim=False, reduce_all=False): + if isinstance(axis, int): + axis = (axis, ) + elif isinstance(axis, list): + axis = tuple(axis) + if reduce_all: + axis = None + out = np.log(np.exp(x).sum(axis=axis, keepdims=keepdim)) + return out + + +class XPUTestLogsumexp(XPUOpTest): + def setUp(self): + self.op_type = 'logsumexp' + self.shape = [2, 3, 4, 5] + self.dtype = 'float32' + self.axis = [-1] + self.keepdim = False + self.reduce_all = False + self.set_attrs() + + np.random.seed(10) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = ref_logsumexp(x, self.axis, self.keepdim, self.reduce_all) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = { + 'axis': self.axis, + 'keepdim': self.keepdim, + 'reduce_all': self.reduce_all + } + + def set_attrs(self): + pass + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + pass + + +class TestLogsumexp_shape(XPUTestLogsumexp): + def set_attrs(self): + self.shape = [4, 5, 6] + + +class TestLogsumexp_axis(XPUTestLogsumexp): + def set_attrs(self): + self.axis = [0, -1] + + +class TestLogsumexp_axis_all(XPUTestLogsumexp): + def set_attrs(self): + self.axis = [0, 1, 2, 3] + + +class TestLogsumexp_keepdim(XPUTestLogsumexp): + def set_attrs(self): + self.keepdim = True + + +class TestLogsumexp_reduce_all(XPUTestLogsumexp): + def set_attrs(self): + self.reduce_all = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py new file mode 100644 index 0000000000000000000000000000000000000000..242b5b14db2bcc3e32f9d5d7144d50a822cb99c6 --- /dev/null +++ b/python/paddle/fluid/variable_index.py @@ -0,0 +1,306 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import numpy as np +from . import unique_name +from . import core + +MAX_INTEGER = 2**31 - 1 + + +def replace_ellipsis(var, item): + from .framework import Variable + # Use slice(None) to replace Ellipsis. + # For var, var.shape = [3,4,5,6] + # + # var[..., 1:2] -> var[:, :, :, 1:2] + # var[0, ...] 
-> var[0] + # var[0, ..., 1:2] -> var[0, :, :, 1:2] + + item = list(item) + + # Remove Variable to skip bug when counting Ellipsis + item_remove_var = [ele for ele in item if not isinstance(ele, Variable)] + ell_count = item_remove_var.count(Ellipsis) + if ell_count == 0: + return item + elif ell_count > 1: + raise IndexError("An index can only have a single ellipsis ('...')") + + ell_idx = item.index(Ellipsis) + + if ell_idx == len(item) - 1: + return item[:-1] + else: + item[ell_idx:ell_idx + 1] = [slice(None)] * ( + len(var.shape) - len(item) + 1) + + return item + + +def is_integer_or_scalar_tensor(ele): + from .framework import Variable + if isinstance(ele, int): + return True + elif isinstance(ele, Variable): + if len(ele.shape) == 1 and ele.shape[0] == 1: + return True + return False + + +def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): + from .framework import Variable + from .layers import utils + + if utils._contain_var(attr): + inputs[tensor_attr_name] = utils._convert_to_tensor_list( + attr, dtype="int64") + for i, dim in enumerate(attr): + if isinstance(dim, Variable): + attrs[attr_name].append(-1) + infer_flags[i] = -1 + else: + attrs[attr_name].append(dim) + else: + attrs[attr_name] = attr + + +def _getitem_impl_(var, item): + """ + Slice the variable. + + Args: + item(int/slice/tuple) : the index. + + Returns: + Sliced variable + """ + from .framework import default_main_program + + if not isinstance(item, tuple): + item = (item, ) + + decrease_axes = [] + axes = [] + starts = [] + ends = [] + steps = [] + reverse_axis = [] + + use_strided_slice = False + + for dim, slice_item in enumerate(item): + if is_integer_or_scalar_tensor(slice_item): + decrease_axes.append(dim) + start = slice_item + step = 1 + end = slice_item + 1 if slice_item != -1 else MAX_INTEGER + + elif isinstance(slice_item, slice): + start = slice_item.start + end = slice_item.stop + step = slice_item.step + + if start is None and end is None and step is None: + continue + + step = 1 if step is None else step + + if start is None and end is None: + assert (step == -1) + reverse_axis.append(dim) + continue + + start = 0 if start is None else start + end = MAX_INTEGER if end is None else end + + else: + raise IndexError( + "Valid index accept int or slice or ellipsis, but received {}.". 
+ format(slice_item)) + + axes.append(dim) + starts.append(start) + ends.append(end) + steps.append(step) + use_strided_slice = True if step != 1 else use_strided_slice + + inputs = {'Input': [var]} + attrs = { + 'axes': axes, + 'starts': [], + 'ends': [], + 'decrease_axis': decrease_axes + } + if use_strided_slice: + attrs['strides'] = [] + + infer_flags = [1] * len(axes) + deal_attrs(attrs, starts, "starts", "StartsTensorList", inputs, infer_flags) + deal_attrs(attrs, ends, "ends", "EndsTensorList", inputs, infer_flags) + deal_attrs(attrs, steps, "strides", "StridesTensorList", inputs, + infer_flags) + attrs['infer_flags'] = infer_flags + + out = var + if len(axes) > 0: + target_block = default_main_program().current_block() + op_type = "strided_slice" if use_strided_slice else "slice" + + slice_out_var = target_block.create_var( + name=unique_name.generate_with_ignorable_key(var.name + "_" + + op_type), + dtype=var.dtype) + target_block.append_op( + type=op_type, + inputs=inputs, + outputs={'Out': [slice_out_var]}, + attrs=attrs) + out = slice_out_var + + if len(reverse_axis) > 0: + from .layers.tensor import reverse + out = reverse(out, axis=reverse_axis) + + return out + + +def _setitem_impl_(var, item, value): + from .framework import default_main_program, Variable + + inputs = {'Input': var} + + # 1. Parse item + if not isinstance(item, tuple): + item = (item, ) + + decrease_axes = [] + axes = [] + starts = [] + ends = [] + steps = [] + + item = replace_ellipsis(var, item) + + for dim, slice_item in enumerate(item): + if is_integer_or_scalar_tensor(slice_item): + decrease_axes.append(dim) + start = slice_item + end = slice_item + 1 if slice_item != -1 else MAX_INTEGER + step = 1 + + elif isinstance(slice_item, slice): + start = slice_item.start + end = slice_item.stop + step = slice_item.step + + if start is None and end is None and step is None: + continue + + step = 1 if step is None else step + + if not isinstance(step, Variable) and step == 0: + raise ValueError( + "When assign a value to a paddle.Tensor, step can not be 0, " + "but received step is {}.".format(step)) + + if isinstance(step, Variable) and (start is None or end is None): + raise ValueError( + "When assign a value to a paddle.Tensor, it's not supported that " + "the start or end is None when the type of step is paddle.Tensor." + ) + + if start is None: + start = 0 if step > 0 else MAX_INTEGER + + if end is None: + end = MAX_INTEGER if step > 0 else (0 - MAX_INTEGER) + else: + raise IndexError( + "Valid index accept int or slice or ellipsis, but received {}.". + format(slice_item)) + + axes.append(dim) + starts.append(start) + ends.append(end) + steps.append(step) + + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'steps': steps, + 'decrease_axes': decrease_axes + } + + from .layers import utils + if utils._contain_var(starts): + inputs['StartsTensorList'] = utils._convert_to_tensor_list(starts) + del attrs['starts'] + if utils._contain_var(ends): + inputs['EndsTensorList'] = utils._convert_to_tensor_list(ends) + del attrs['ends'] + if utils._contain_var(steps): + inputs['StepsTensorList'] = utils._convert_to_tensor_list(steps) + del attrs['steps'] + + # 2. 
Parse value + dtype = var.dtype + attrs['dtype'] = dtype + + from .data_feeder import convert_dtype + # 2.1 value is an integer of float + if isinstance(value, (int, float)): + value = np.array([value]).astype(convert_dtype(dtype)) + + # 2.2 value is a np.ndarray + if isinstance(value, np.ndarray): + shape = list(value.shape) + if dtype == core.VarDesc.VarType.BOOL: + value_name = "bool_values" + values = [bool(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.FP64: + value_name = "fp64_values" + values = [float(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.INT64: + value_name = "int64_values" + values = [int(v) for v in value.flat] + else: + raise TypeError( + "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, float32, int32 or int64, but " + "received %s." % convert_dtype(dtype)) + attrs[value_name] = values + attrs["shape"] = shape + + elif isinstance(value, Variable): + inputs["ValueTensor"] = value + else: + raise TypeError( + "Only support to assign an integer, float, numpy.ndarray or " + "paddle.Tensor to a paddle.Tensor, but received {}".format( + type(value))) + + cur_block = default_main_program().current_block() + cur_block.append_op( + type="set_value", inputs=inputs, outputs={'Out': var}, attrs=attrs) + + return var diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 493574c5bef47a50c633a95488d33fdd7b7dd29d..1705db50d391a9d507e25c0b8f8aa6081b170923 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -491,12 +491,12 @@ def _save_binary_var(obj, path): format(type(obj))) -def save(obj, path, protocol=2, **configs): +def save(obj, path, protocol=4, **configs): ''' Save an object to the specified path. .. note:: - Now supports saving ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor. + Now supports saving ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor, Program. .. note:: Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, @@ -512,7 +512,7 @@ def save(obj, path, protocol=2, **configs): path(str) : The path of the object to be saved. If saved in the current directory, the input path string will be used as the file name. protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. - Default: 2 + Default: 4 **configs(dict, optional): optional keyword arguments. The following options are currently supported: use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. If True, save the file in the c++ binary format when saving a single static graph variable; otherwise, save it in pickle format. 
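
One note on the default-protocol change above (paddle.save now defaults to pickle protocol 4 instead of 2): protocol 4 has been available since Python 3.4 and supports objects larger than 4 GB, which protocol 2 cannot serialize. Callers that need the old on-disk format can still pin the protocol explicitly; a minimal sketch:

    import paddle

    layer = paddle.nn.Linear(3, 4)
    # keep writing protocol-2 pickles for readers that expect the old default
    paddle.save(layer.state_dict(), "linear.pdparams", protocol=2)
    state = paddle.load("linear.pdparams")
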
@@ -544,7 +544,18 @@ def save(obj, path, protocol=2, **configs): # save weight of emb paddle.save(emb.weight, "emb.weight.pdtensor") - # example 2: static graph + # example 2: Save multiple state_dict at the same time + from paddle import nn + from paddle.optimizer import Adam + + layer = paddle.nn.Linear(3, 4) + adam = Adam(learning_rate=0.001, parameters=layer.parameters()) + obj = {'model': layer.state_dict(), 'opt': adam.state_dict(), 'epoch': 100} + path = 'example/model.pdparams' + paddle.save(obj, path) + + + # example 3: static graph import paddle import paddle.static as static @@ -570,6 +581,18 @@ def save(obj, path, protocol=2, **configs): # save/load state_dict path_state_dict = 'temp/model.pdparams' paddle.save(prog.state_dict("param"), path_tensor) + + # example 4: save program + import paddle + + paddle.enable_static() + + data = paddle.static.data( + name='x_static_save', shape=(None, 224), dtype='float32') + y_static = z = paddle.static.nn.fc(data, 10) + main_program = paddle.static.default_main_program() + path = "example/main_program.pdmodel" + paddle.save(main_program, path) ''' # 1. input check filename = os.path.basename(path) @@ -667,7 +690,7 @@ def load(path, **configs): Load an object can be used in paddle from specified path. .. note:: - Now supports loading ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor. + Now supports loading ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor, Program. .. note:: In order to use the model parameters saved by paddle more efficiently, @@ -714,8 +737,6 @@ def load(path, **configs): Examples: .. code-block:: python - import paddle - # example 1: dynamic graph import paddle emb = paddle.nn.Embedding(10, 10) @@ -744,7 +765,19 @@ def load(path, **configs): load_weight = paddle.load("emb.weight.pdtensor") - # example 2: static graph + # example 2: Load multiple state_dict at the same time + from paddle import nn + from paddle.optimizer import Adam + + layer = paddle.nn.Linear(3, 4) + adam = Adam(learning_rate=0.001, parameters=layer.parameters()) + obj = {'model': layer.state_dict(), 'opt': adam.state_dict(), 'epoch': 100} + path = 'example/model.pdparams' + paddle.save(obj, path) + obj_load = paddle.load(path) + + + # example 3: static graph import paddle import paddle.static as static @@ -773,6 +806,22 @@ def load(path, **configs): paddle.save(prog.state_dict("param"), path_tensor) load_state_dict = paddle.load(path_tensor) + + # example 4: load program + import paddle + + paddle.enable_static() + + data = paddle.static.data( + name='x_static_save', shape=(None, 224), dtype='float32') + y_static = z = paddle.static.nn.fc(data, 10) + main_program = paddle.static.default_main_program() + path = "example/main_program.pdmodel" + paddle.save(main_program, path) + load_main = paddle.load(path) + print(load_main) + + ''' if os.path.isfile(path): diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 2de065d62a4f8ce75dd079b762d5b899bcbd4d26..eecea3034a752ec0203764219fbdcd8c671c02cf 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -199,7 +199,7 @@ class Conv1D(_ConvNd): * :math:`X`: Input value, a ``Tensor`` with 'NCL' format or 'NLC' format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCK] . * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. 
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -259,11 +259,15 @@ class Conv1D(_ConvNd): is not set, the bias is initialized zero. Default: None. Attribute: + **weight** (Parameter): the learnable weights of filter of this layer. + **bias** (Parameter or None): the learnable bias of this layer. Shape: - x: 3-D tensor with shape: (batch, in_channels, length) or (batch, length, in_channels). + - weight: 3-D tensor with shape: (out_channels, in_channels, kernel_size) + - bias: 1-D tensor with shape: (out_channels) - output: 3-D tensor with same shape as input x. Raises: @@ -444,6 +448,8 @@ class Conv1DTranspose(_ConvNd): Shape: - x(Tensor): 3-D tensor with shape (batch, in_channels, length) when data_format is "NCL" or shape (batch, length, in_channels) when data_format is "NLC". + - weight(Tensor): 3-D tensor with shape (in_channels, out_channels, kernel_length). + - bias(Tensor): 1-D tensor with shape (out_channels). - output_size(int|tuple|list, optional): The output image size. If output size is a tuple/list, it must contain one integer, (feature_length). None if use kernel_size, padding, output_padding and stride to calculate output_size. If output_size and kernel_size are specified at the same time, They should follow the formula above. Default: None. output_size and kernel_size should not be None at the same time. - output(Tensor): 3-D tensor with same shape as input x. @@ -540,7 +546,7 @@ class Conv2D(_ConvNd): * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -590,6 +596,10 @@ class Conv2D(_ConvNd): - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{out}, C_{in}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` Where @@ -676,15 +686,15 @@ class Conv2DTranspose(_ConvNd): filter, and dilations, strides, paddings. Input and output are in NCHW format. Where N is batch size, C is the number of feature map, H is the height of the feature map, and W is the width of the feature map. - Filter's shape is [MCHW] , where M is the number of input feature map, - C is the number of output feature map, H is the height of the filter, + Filter's shape is [CMHW] , where C is the number of input feature map, + M is the number of output feature map, H is the height of the filter, and W is the width of the filter. If the groups is greater than 1, C will equal the number of input feature map divided by the groups. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. The details of convolution transpose layer, please refer to the following explanation and references - `conv2dtranspose `_ . + `conv2dtranspose `_ . For each input :math:`X`, the equation is: .. math:: @@ -694,9 +704,9 @@ class Conv2DTranspose(_ConvNd): Where: * :math:`X`: Input value, a ``Tensor`` with NCHW format. - * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . + * :math:`W`: Filter value, a ``Tensor`` with shape [CMHW] . * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. 
+ * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -749,6 +759,10 @@ class Conv2DTranspose(_ConvNd): - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{in}, C_{out}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` Where @@ -851,7 +865,7 @@ class Conv3D(_ConvNd): * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -901,6 +915,10 @@ class Conv3D(_ConvNd): - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{out}, C_{in}, K_{d}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` Where @@ -995,7 +1013,7 @@ class Conv3DTranspose(_ConvNd): is the width of the feature. Parameters(dilations, strides, paddings) are two elements. These two elements represent height and width, respectively. The details of convolution transpose layer, please refer to the following - explanation and references `therein `_. + explanation and references `therein `_. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. @@ -1008,9 +1026,9 @@ class Conv3DTranspose(_ConvNd): In the above equation: * :math:`X`: Input value, a tensor with NCDHW format. - * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`W`: Filter value, a tensor with CMDHW format. * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -1077,6 +1095,10 @@ class Conv3DTranspose(_ConvNd): - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{in}, C_{out}, K_{d}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` Where diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 361c0e80f90d7dd27d7e5bfc11813d476aa12697..e1012e7656a3d3c00913403b886f1848ac6c3ce2 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -118,6 +118,16 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): place = _current_expected_place() if not isinstance(data, np.ndarray): + + def _handle_diff_place_dtype(data, dtype, place, stop_gradient): + data.stop_gradient = stop_gradient + if not data.place._equals(place): + data = data._copy_to(place, False) + if dtype: + if convert_dtype(dtype) != convert_dtype(data.dtype): + return data.astype(convert_dtype(dtype)) + return data + if np.isscalar(data) and not isinstance(data, str): data = np.array([data]) elif isinstance(data, (list, tuple)): @@ -128,13 +138,11 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "this means the input data contains nested lists with different lengths. 
" ) elif isinstance(data, paddle.Tensor): - data.stop_gradient = stop_gradient - if not data.place._equals(place): - data = data._copy_to(place, False) - if dtype: - if convert_dtype(dtype) != convert_dtype(data.dtype): - return data.astype(convert_dtype(dtype)) - return data + return _handle_diff_place_dtype(data, dtype, place, stop_gradient) + elif isinstance(data, (core.Tensor, core.LoDTensor)): + # convert LoDTensor to VarBase first, and then process it as input VarBase + data = paddle.Tensor(data) + return _handle_diff_place_dtype(data, dtype, place, stop_gradient) else: raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 97826f7d5f81d9da8df2c97833f1dcd84a11923a..67e6c7f8e44d740f179961c7e183efdced9ff805 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -80,7 +80,7 @@ def concat(x, axis=0, name=None): Args: x(list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16, - float32, float64, int32, int64. All the Tensors in ``x`` must have same data type. + float32, float64, int32, int64, uint8. All the Tensors in ``x`` must have same data type. axis(int|Tensor, optional): Specify the axis to operate on the input Tensors. It's a scalar with data type int or a Tensor with shape [1] and data type int32 or int64. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``, diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index ea46ea8b39195aeed59f1f2b8ce8048a86fa0aab..104d979ef6785826277ec61756bc75c617be7e2c 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -469,10 +469,6 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): ########################### -- END -- ########################### add_compile_flag(extra_compile_args, ['-w']) # disable warning - # Note(Aurelius84): This marco will impact memory layout of `Tensor`. - # We align it automatically with pre-installed Paddle. 
- if core.is_compiled_with_mkldnn(): - add_compile_flag(extra_compile_args, ['-DPADDLE_WITH_MKLDNN']) if use_cuda: extra_link_args.append('-lcudart') diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py index 448d6efb52beca953de7981312e8f9131e6fb05d..65c0b604efd5d719cf9df313592b6e3561b5958a 100644 --- a/python/paddle/vision/datasets/flowers.py +++ b/python/paddle/vision/datasets/flowers.py @@ -93,62 +93,44 @@ class Flowers(Dataset): .format(backend)) self.backend = backend - self.flag = MODE_FLAG_MAP[mode.lower()] + flag = MODE_FLAG_MAP[mode.lower()] - self.data_file = data_file - if self.data_file is None: + if not data_file: assert download, "data_file is not set and downloading automatically is disabled" - self.data_file = _check_exists_and_download( + data_file = _check_exists_and_download( data_file, DATA_URL, DATA_MD5, 'flowers', download) - self.label_file = label_file - if self.label_file is None: + if not label_file: assert download, "label_file is not set and downloading automatically is disabled" - self.label_file = _check_exists_and_download( + label_file = _check_exists_and_download( label_file, LABEL_URL, LABEL_MD5, 'flowers', download) - self.setid_file = setid_file - if self.setid_file is None: + if not setid_file: assert download, "setid_file is not set and downloading automatically is disabled" - self.setid_file = _check_exists_and_download( + setid_file = _check_exists_and_download( setid_file, SETID_URL, SETID_MD5, 'flowers', download) self.transform = transform - # read dataset into memory - self._load_anno() - - self.dtype = paddle.get_default_dtype() - - def _load_anno(self): - self.name2mem = {} - self.data_tar = tarfile.open(self.data_file) - for ele in self.data_tar.getmembers(): - self.name2mem[ele.name] = ele + data_tar = tarfile.open(data_file) + self.data_path = data_file.replace(".tgz", "/") + if not os.path.exists(self.data_path): + os.mkdir(self.data_path) + data_tar.extractall(self.data_path) scio = try_import('scipy.io') - - # double check data download - self.label_file = _check_exists_and_download(self.label_file, LABEL_URL, - LABEL_MD5, 'flowers', True) - - self.setid_file = _check_exists_and_download(self.setid_file, SETID_URL, - SETID_MD5, 'flowers', True) - - self.labels = scio.loadmat(self.label_file)['labels'][0] - self.indexes = scio.loadmat(self.setid_file)[self.flag][0] + self.labels = scio.loadmat(label_file)['labels'][0] + self.indexes = scio.loadmat(setid_file)[flag][0] def __getitem__(self, idx): index = self.indexes[idx] label = np.array([self.labels[index - 1]]) img_name = "jpg/image_%05d.jpg" % index - img_ele = self.name2mem[img_name] - image = self.data_tar.extractfile(img_ele).read() - + image = os.path.join(self.data_path, img_name) if self.backend == 'pil': - image = Image.open(io.BytesIO(image)) + image = Image.open(image) elif self.backend == 'cv2': - image = np.array(Image.open(io.BytesIO(image))) + image = np.array(Image.open(image)) if self.transform is not None: image = self.transform(image) @@ -156,7 +138,7 @@ class Flowers(Dataset): if self.backend == 'pil': return image, label.astype('int64') - return image.astype(self.dtype), label.astype('int64') + return image.astype(paddle.get_default_dtype()), label.astype('int64') def __len__(self): return len(self.indexes) diff --git a/python/setup.py.in b/python/setup.py.in index 0f2e97192c1df1ab7a610dca4c2ceadfcb54ba40..79c67182f9c7911aaae32dfaf660c49dbe683e1c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -185,6 +185,7 @@ 
packages=['paddle', 'paddle.fluid.contrib.mixed_precision', 'paddle.fluid.contrib.mixed_precision.bf16', 'paddle.fluid.contrib.layers', + 'paddle.fluid.contrib.sparsity', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details', 'paddle.fluid.incubate', diff --git a/scripts/paddle b/scripts/paddle deleted file mode 100644 index 5f256ccf157910fa0c15447da5a5c65e23c0c223..0000000000000000000000000000000000000000 --- a/scripts/paddle +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash - -function version(){ - echo "PaddlePaddle , compiled with" - echo " with_avx: ON" - echo " with_gpu: OFF" - echo " with_mkl: ON" - echo " with_mkldnn: " - echo " with_python: ON" -} - -function ver2num() { - set -e - # convert version to number. - if [ -z "$1" ]; then # empty argument - printf "%03d%03d%03d%03d%03d" 0 - else - local VERN=$(echo $1 | sed 's#v##g' | sed 's#\.# #g' \ - | sed 's#a# 0 #g' | sed 's#b# 1 #g' | sed 's#rc# 2 #g') - if [ `echo $VERN | wc -w` -eq 3 ] ; then - printf "%03d%03d%03d%03d%03d" $VERN 999 999 - else - printf "%03d%03d%03d%03d%03d" $VERN - fi - fi - set +e -} - -function cpu_config() { - # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status - # only when MKL enabled - if [ "ON" == "OFF" ]; then - return 0 - fi - platform="`uname -s`" - ht=0 - if [ $platform == "Linux" ]; then - ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` - elif [ $platform == "Darwin" ]; then - if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then - # HT is OFF - ht=1 - fi - else - return 0 - fi - if [ $ht -eq 1 ]; then # HT is OFF - if [ -z "$KMP_AFFINITY" ]; then - export KMP_AFFINITY="granularity=fine,compact,0,0" - fi - if [ -z "$OMP_DYNAMIC" ]; then - export OMP_DYNAMIC="FALSE" - fi - else # HT is ON - if [ -z "$KMP_AFFINITY" ]; then - export KMP_AFFINITY="granularity=fine,compact,1,0" - fi - if [ -z "$OMP_DYNAMIC" ]; then - export OMP_DYNAMIC="True" - fi - fi -} - -function threads_config() { - # auto set OMP_NUM_THREADS and MKL_NUM_THREADS - # according to trainer_count and total processors - # only when MKL enabled - # auto set OPENBLAS_NUM_THREADS when do not use MKL - platform="`uname -s`" - processors=0 - if [ $platform == "Linux" ]; then - processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` - elif [ $platform == "Darwin" ]; then - processors=`sysctl -n hw.logicalcpu` - else - return 0 - fi - trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` - if [ -z $trainers ]; then - trainers=1 - fi - threads=$((processors / trainers)) - if [ $threads -eq 0 ]; then - threads=1 - fi - if [ "ON" == "ON" ]; then - if [ -z "$OMP_NUM_THREADS" ]; then - export OMP_NUM_THREADS=$threads - fi - if [ -z "$MKL_NUM_THREADS" ]; then - export MKL_NUM_THREADS=$threads - fi - else - if [ -z "$OPENBLAS_NUM_THREADS" ]; then - export OPENBLAS_NUM_THREADS=$threads - fi - if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then - export OPENBLAS_MAIN_FREE=1 - fi - fi - -} - -PADDLE_CONF_HOME="$HOME/.config/paddle" -mkdir -p ${PADDLE_CONF_HOME} - -if [ -z "${PADDLE_NO_STAT+x}" ]; then - SERVER_VER=`curl -m 5 -X POST --data content="{ \"version\": \"\" }"\ - -b ${PADDLE_CONF_HOME}/paddle.cookie \ - -c ${PADDLE_CONF_HOME}/paddle.cookie \ - http://api.paddlepaddle.org/version 2>/dev/null` - if [ $? -eq 0 ] && [ "$(ver2num )" -lt $(ver2num $SERVER_VER) ]; then - echo "Paddle release a new version ${SERVER_VER}, you can get the install package in http://www.paddlepaddle.org" - fi -fi - -PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -if [ ! 
-z "${DEBUGGER}" ]; then - echo "Using debug command ${DEBUGGER}" -fi - -CUDNN_LIB_PATH="" - -if [ ! -z "${CUDNN_LIB_PATH}" ]; then - export LD_LIBRARY_PATH=${CUDNN_LIB_PATH}:${LD_LIBRARY_PATH} -fi - -export PYTHONPATH=${PWD}:${PYTHONPATH} - - -# Check python lib installed or not. -pip --help > /dev/null -if [ $? -ne 0 ]; then - echo "pip should be installed to run paddle." - exit 1 -fi - -if [ "OFF" == "ON" ]; then - PADDLE_NAME="paddlepaddle-gpu" -else - PADDLE_NAME="paddlepaddle" -fi - -INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'` - -if [ -z "${INSTALLED_VERSION}" ]; then - INSTALLED_VERSION="0.0.0" # not installed -fi -cat <> %s' % + (clazz_filename, ut_map_file)) + break + else: + error_files.append(clazz_filename) + break + print("============len(pyCov_file)") + print(len(pyCov_file)) + print("============error") + print(error_files) + + +if __name__ == "__main__": + rootPath = sys.argv[1] + ut = sys.argv[2] + analysisPyXml(rootPath, ut) diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index 7301e9954e910aaa117bec5222461feb3ad6aefa..7457bcb26853709dae8affc98f54dfa7231f1eb5 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -35,8 +35,8 @@ elif [[ "$SYSTEM" == "Windows_NT" ]];then git remote | grep upstream if [ $? != 0 ]; then git remote add upstream https://github.com/PaddlePaddle/Paddle.git - git fetch upstream develop fi + git fetch upstream ${BRANCH} fi CURBRANCH=`git rev-parse --abbrev-ref HEAD` echo $CURBRANCH diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b1395c28878e3a5d52ac081d7926dfbd1863530c..ef9af288fb0a2a26341f01f826efd9bf8d0f01d0 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -52,7 +52,7 @@ API_FILES=("CMakeLists.txt" "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" - "tools/wlist.json" + "tools/print_signatures.py" "tools/sampcd_processor.py" "paddle/scripts/paddle_build.bat" "tools/windows/run_unittests.sh" @@ -80,11 +80,10 @@ function add_failed(){ echo_list="${echo_list[@]}$1" } -function run_test_sampcd_processor() { +function run_tools_test() { CUR_PWD=$(pwd) cd ${PADDLE_ROOT}/tools - python test_sampcd_processor.py - python test_print_signatures.py + python $1 cd ${CUR_PWD} } @@ -141,12 +140,12 @@ for API_FILE in ${API_FILES[*]}; do elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. 
For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n" check_approval 1 39303645 6836917 43953930 - elif [ "${API_FILE}" == "tools/wlist.json" ];then - echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n" - check_approval 1 29231 elif [ "${API_FILE}" == "tools/sampcd_processor.py" ];then echo_line="test_sampcd_processor.py will be executed for changed sampcd_processor.py.\n" - run_test_sampcd_processor + run_tools_test test_sampcd_processor.py + elif [ "${API_FILE}" == "tools/print_signatures.py" ];then + echo_line="test_print_signatures.py will be executed for changed print_signatures.py.\n" + run_tools_test test_print_signatures.py elif [ "${API_FILE}" == "python/paddle/distributed/fleet/__init__.py" ]; then echo_line="You must have (fuyinno4 (Recommend), raindrops2sea) approval for ${API_FILE} changes" check_approval 1 35824027 38231817 diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 001f380049f92d1bb667d85b49841bc006c8517f..0df3b4914f5df972aab26768bc4d18b1ffc00163 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -130,7 +130,10 @@ class PRChecker(object): if not files: break for f in files: - file_list.append(PADDLE_ROOT + f.filename) + if f.status == 'removed': + file_list.append('removed') + else: + file_list.append(PADDLE_ROOT + f.filename) page += 1 return file_list @@ -228,6 +231,15 @@ class PRChecker(object): print('PREC {} is only comment'.format(f)) return True + def get_all_count(self): + os.system( + "cd %sbuild && ctest -N|grep 'Total Tests:' | awk -F ': ' '{print $2}' > testCount" + % PADDLE_ROOT) + f = open("%sbuild/testCount" % PADDLE_ROOT) + testCount = f.read() + f.close() + return int(testCount.strip()) + def get_pr_ut(self): """ Get unit tests in pull request. """ if self.full_case: @@ -236,77 +248,89 @@ class PRChecker(object): ut_list = [] file_ut_map = None ret = self.__urlretrieve( - 'https://sys-p0.bj.bcebos.com/prec/file_ut.json{}'.format( - self.suffix), 'file_ut.json{}'.format(self.suffix)) + 'https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json', + 'ut_file_map.json') if not ret: print('PREC download file_ut.json failed') exit(1) - with open('file_ut.json' + self.suffix) as jsonfile: + with open('ut_file_map.json') as jsonfile: file_ut_map = json.load(jsonfile) - for f in self.get_pr_files(): - current_system = platform.system() - if current_system == "Darwin" or current_system == "Windows": - f_judge = f.replace(PADDLE_ROOT, '/paddle/', 1) - f_judge = f_judge.replace('//', '/') - else: - f_judge = f - if f_judge not in file_ut_map: - if f.endswith('.md'): - ut_list.append('md_placeholder') - elif f.endswith('.h') or f.endswith('.cu'): - if self.is_only_comment(f): - ut_list.append('h_cu_comment_placeholder') - else: - print( - 'PREC dismatch: {} not in file ut map and not md or comment'. - format(f)) - return '' - elif f.endswith('.cc') or f.endswith('.py') or f.endswith( - '.cu'): - if f.find('test_') != -1 or f.find('_test') != -1: - print('PREC {} need check new ut'.format(f)) - check_added_ut = True - elif self.is_only_comment(f): - ut_list.append('nomap_comment_placeholder') - else: - print( - 'PREC dismatch: {} not in file ut map and not new ut or comment'. 
- format(f)) - return '' + + current_system = platform.system() + notHitMapFiles = [] + hitMapFiles = [] + onlyCommentsFilesOrXpu = [] + file_list = self.get_pr_files() + if 'removed' in file_list: + print("ipipe_log_param_PRECISION_TEST: false") + print("notHitMapFiles: [rm file]") + return '' + else: + for f in file_list: + if current_system == "Darwin" or current_system == "Windows" or self.suffix == ".py3": + f_judge = f.replace(PADDLE_ROOT, '/paddle/', 1) + f_judge = f_judge.replace('//', '/') else: - print('PREC dismatch: {} not in file ut map'.format(f)) - return '' - else: - if self.is_only_comment(f): - ut_list.append('map_comment_placeholder') + f_judge = f + if f_judge not in file_ut_map: + if f_judge.endswith('.md'): + ut_list.append('md_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + elif 'tests/unittests/xpu' in f_judge or 'tests/unittests/npu' in f_judge: + ut_list.append('xpu_npu_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + elif f_judge.endswith(('.h', '.cu', '.cc', 'py')): + if f_judge.find('test_') != -1 or f_judge.find( + '_test') != -1: + check_added_ut = True + if self.is_only_comment(f): + ut_list.append('comment_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + else: + notHitMapFiles.append(f_judge) + else: + notHitMapFiles.append(f_judge) else: - ut_list.extend(file_ut_map.get(f_judge)) - ut_list = list(set(ut_list)) - - if check_added_ut: - with open('{}/added_ut'.format(PADDLE_ROOT)) as utfile: - for ut in utfile: - print('PREC NEW UT: {}'.format(ut.rstrip('\r\n'))) - ut_list.append(ut.rstrip('\r\n')) - - if ut_list: - ret = self.__urlretrieve( - 'https://sys-p0.bj.bcebos.com/prec/prec_delta{}'.format( - self.suffix), 'prec_delta{}'.format(self.suffix)) - if ret: - with open('prec_delta' + self.suffix) as delta: - for ut in delta: - ut_list.append(ut.rstrip('\r\n')) + if self.is_only_comment(f): + ut_list.append('comment_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + else: + hitMapFiles.append(f_judge) + ut_list.extend(file_ut_map.get(f_judge)) + ut_list = list(set(ut_list)) + if len(notHitMapFiles) != 0: + print("ipipe_log_param_PRECISION_TEST: false") + print("notHitMapFiles: %s" % notHitMapFiles) + return '' else: - print('PREC download prec_delta failed') - exit(1) - - return '\n'.join(ut_list) + if check_added_ut: + with open('{}/added_ut'.format(PADDLE_ROOT)) as utfile: + for ut in utfile: + ut_list.append(ut.rstrip('\r\n')) + if ut_list: + ret = self.__urlretrieve( + 'https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta', + 'prec_delta') + if ret: + with open('prec_delta') as delta: + for ut in delta: + ut_list.append(ut.rstrip('\r\n')) + else: + print('PREC download prec_delta failed') + exit(1) + print("ipipe_log_param_PRECISION_TEST: true") + print("ipipe_log_param_PRECISION_TEST_Cases_count: %s" % + len(ut_list)) + PRECISION_TEST_Cases_ratio = format( + float(len(ut_list)) / float(self.get_all_count()), + '.2f') + print("ipipe_log_param_PRECISION_TEST_Cases_ratio: %s" % + PRECISION_TEST_Cases_ratio) + return '\n'.join(ut_list) if __name__ == '__main__': pr_checker = PRChecker() pr_checker.init() - #print(pr_checker.get_pr_ut()) with open('ut_list', 'w') as f: f.write(pr_checker.get_pr_ut()) diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py new file mode 100644 index 0000000000000000000000000000000000000000..42940386ca077398b4d5c7fe310f33fd9068f2d3 --- /dev/null +++ b/tools/get_single_test_cov.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import time +import sys +import re + + +def getFNDAFile(rootPath, test): + filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test) + fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) + os.system('touch %s' % fn_filename) + f = open(filename) + lines = f.readlines() + for line in lines: + line = line.replace('\n', '') + if line.startswith(('SF:')): + os.system('echo %s >> %s' % (line, fn_filename)) + elif line.startswith(('FNDA:')): + hit = int(line.split('FNDA:')[1].split(',')[0]) + if hit != 0: + os.system('echo %s >> %s' % (line, fn_filename)) + f.close() + + +def analysisFNDAFile(rootPath, test): + ut_map_file = '%s/build/ut_map/%s/%s.txt' % (rootPath, test, test) + os.system('touch %s' % ut_map_file) + fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) + f = open(fn_filename) + data = f.read().split('SF:') + for message in data: + if 'FNDA:' in message: + message_list = message.split('\n') + clazz_filename = message_list[0] + if not clazz_filename.endswith('.h'): #filter .h's Analysis + for i in range(1, len(message_list) - 1): + fn = message_list[i] + matchObj = re.match( + r'(.*)Maker(.*)|(.*)Touch(.*)Regist(.*)|(.*)Touch(.*)JitKernel(.*)|(.*)converterC2Ev(.*)', + fn, re.I) + if matchObj == None: + os.system('echo %s >> %s' % + (clazz_filename, ut_map_file)) + break + f.close() + + +def getCovinfo(rootPath, test): + ut_map_path = '%s/build/ut_map/%s' % (rootPath, test) + os.system( + 'cd %s && lcov --capture -d . -o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1' + % ut_map_path) + os.system( + "cd %s && lcov --extract coverage.info '/paddle/paddle/fluid/framework/*' '/paddle/paddle/fluid/imperative/*' '/paddle/paddle/fluid/inference/*' '/paddle/paddle/fluid/memory/*' '/paddle/paddle/fluid/operators/*' '/paddle/paddle/fluid/string/*' '/paddle/paddle/fluid/distributed/*' '/paddle/paddle/fluid/extension/*' '/paddle/paddle/fluid/platform/*' '/paddle/paddle/fluid/pybind/*' -o coverage.info.tmp --rc lcov_branch_coverage=0 > /dev/null 2>&1" + % ut_map_path) + os.system('rm -rf %s/paddle' % ut_map_path) + os.system('rm -rf %s/coverage.info' % ut_map_path) + getFNDAFile(rootPath, test) + analysisFNDAFile(rootPath, test) + + +if __name__ == "__main__": + rootPath = sys.argv[1] + case = sys.argv[2] + getCovinfo(rootPath, case) diff --git a/tools/get_ut_file_map.py b/tools/get_ut_file_map.py new file mode 100644 index 0000000000000000000000000000000000000000..d952a299d490e5adb574c05b357aeede587ddec1 --- /dev/null +++ b/tools/get_ut_file_map.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import re +import json + + +def get_all_paddle_file(rootPath): + """get all file in Paddle repo: paddle/fluild, python""" + traverse_files = ['%s/paddle/fluid' % rootPath, '%s/python' % rootPath] + all_file_paddle = '%s/build/all_file_paddle' % rootPath + all_file_paddle_list = [] + with open(all_file_paddle, 'w') as f: + for filename in traverse_files: + g = os.walk(filename) + for path, dir_list, file_list in g: + for file_name in file_list: + all_file_paddle_list.append(os.path.join(path, file_name)) + return all_file_paddle_list + + +def get_all_uts(rootPath): + all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + os.system( + 'cd %s/build && ctest -N -V | grep -Ei "Test[ \t]+#" | grep -oEi "\w+$" > %s' + % (rootPath, all_uts_paddle)) + + +def remove_useless_file(rootPath): + """remove useless file in ut_file_map.json""" + all_file_paddle_list = get_all_paddle_file(rootPath) + ut_file_map_new = {} + ut_file_map = "%s/build/ut_file_map.json" % rootPath + with open(ut_file_map, 'r') as load_f: + load_dict = json.load(load_f) + for key in load_dict: + if key in all_file_paddle_list: + ut_file_map_new[key] = load_dict[key] + + with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + json.dump(ut_file_map_new, f, indent=4) + print("remove_useless_file ut_file_map success!!") + + +def handle_ut_file_map(rootPath): + utNotSuccess = '' + ut_map_path = "%s/build/ut_map" % rootPath + files = os.listdir(ut_map_path) + ut_file_map = {} + count = 0 + not_success_file = open("%s/build/prec_delta" % rootPath, 'w') + for ut in files: + count = count + 1 + print("ut %s: %s" % (count, ut)) + coverage_info = '%s/%s/coverage.info.tmp' % (ut_map_path, ut) + if os.path.exists(coverage_info): + filename = '%s/%s/%s.txt' % (ut_map_path, ut, ut) + f = open(filename) + lines = f.readlines() + for line in lines: + line = line.replace('\n', '').strip() + if line == '': + continue + elif line.startswith('/paddle/build'): + source_file = line.replace('/build', '') + #source_file = re.sub('.pb.*', '.proto', source_file) + elif 'precise test map fileeee:' in line: + source_file = line.split('precise test map fileeee:')[ + 1].strip() + else: + source_file = line + if source_file not in ut_file_map: + ut_file_map[source_file] = [] + if ut not in ut_file_map[source_file]: + ut_file_map[source_file].append(ut) + + else: + not_success_file.write('%s\n' % ut) + utNotSuccess = utNotSuccess + '^%s$|' % ut + + not_success_file.close() + + with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + json.dump(ut_file_map, f, indent=4) + + print("utNotSuccess:") + print(utNotSuccess) + + +def notsuccessfuc(rootPath): + utNotSuccess = '' + ut_map_path = "%s/build/ut_map" % rootPath + files = os.listdir(ut_map_path) + count = 0 + # ut failed!! 
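For reference, a sketch of how the `ut_file_map.json` written by `handle_ut_file_map()` above can be consumed; the key shown is a made-up example, real keys are instrumented source files with the `/build` prefix stripped:

    import json

    with open('/paddle/build/ut_file_map.json') as f:
        ut_file_map = json.load(f)
    # hypothetical key; the value is the list of unit tests whose coverage touched it
    print(ut_file_map.get('/paddle/paddle/fluid/operators/scale_op.cc', []))
    # e.g. ['test_scale_op', 'test_scale_mkldnn_op']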
+ for ut in files: + coverage_info = '%s/%s/coverage.info.tmp' % (ut_map_path, ut) + if os.path.exists(coverage_info): + pass + else: + count = count + 1 + utNotSuccess = utNotSuccess + '^%s$|' % ut + + # ut not exec + get_all_uts(rootPath) + with open("/paddle/build/all_uts_paddle", "r") as f: + data = f.readlines() + for ut in data: + ut = ut.replace('\n', '').strip() + if ut not in files: + print(ut) + count = count + 1 + utNotSuccess = utNotSuccess + '^%s$|' % ut + + if utNotSuccess != '': + print("utNotSuccess count: %s" % count) + f = open('%s/build/utNotSuccess' % rootPath, 'w') + f.write(utNotSuccess[:-1]) + f.close() + + +def ut_file_map_supplement(rootPath): + ut_file_map_new = "%s/build/ut_file_map.json" % rootPath + os.system('mkdir /pre_test') + os.system( + 'cd /pre_test && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json --no-check-certificate' + ) + ut_file_map_old = "/pre_test/ut_file_map.json" + ut_file_map_full = {} + with open(ut_file_map_new, 'r') as load_f: + load_dict_new = json.load(load_f) + with open(ut_file_map_old, 'r') as f: + load_dict_old = json.load(f) + + for filename in load_dict_new: + ut_file_map_full[filename] = load_dict_new[filename] + if filename in load_dict_old: + for ut in load_dict_old[filename]: + if ut not in ut_file_map_full[filename]: + ut_file_map_full[filename].append(ut) + + for filename in load_dict_old: + if filename not in load_dict_new: + ut_file_map_full[filename] = load_dict_old[filename] + + with open("/pre_test/ut_file_map.json", "w") as f: + json.dump(ut_file_map_full, f, indent=4) + print("ut_file_map_full success!!") + + all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + with open(all_uts_paddle, 'r') as f: + all_uts_paddle_list = f.readlines() + f.close() + os.system( + 'cd /pre_test && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta --no-check-certificate' + ) + prec_delta_old = '/pre_test/prec_delta' + prec_delta_new = "%s/build/prec_delta" % rootPath + with open(prec_delta_old, 'r') as f: + prec_delta_old_list = f.readlines() + f.close() + with open(prec_delta_new, 'r') as f: + prec_delta_new_list = f.readlines() + f.close() + for ut in prec_delta_old_list: + if ut not in prec_delta_new_list and ut not in all_uts_paddle_list: + prec_delta_new_list.append(ut) + prec_delta_file = open("/pre_test/prec_delta", 'w') + for ut in prec_delta_new_list: + prec_delta_file.write(ut) + prec_delta_file.close() + + +if __name__ == "__main__": + func = sys.argv[1] + if func == 'get_not_success_ut': + rootPath = sys.argv[2] + notsuccessfuc(rootPath) + elif func == 'get_ut_map': + rootPath = sys.argv[2] + handle_ut_file_map(rootPath) + remove_useless_file(rootPath) + ut_file_map_supplement(rootPath) diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py new file mode 100644 index 0000000000000000000000000000000000000000..7c300d96c84618ec6874f65b838542a837ccc4f9 --- /dev/null +++ b/tools/handle_h_cu_file.py @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import queue +import threading +import os +import json +import time +import sys + +taskQueue = queue.Queue() + + +def worker(fun): + while True: + temp = taskQueue.get() + fun(temp) + taskQueue.task_done() + + +def threadPool(threadPoolNum): + threadPool = [] + for i in range(threadPoolNum): + thread = threading.Thread(target=worker, args={doFun, }) + thread.daemon = True + threadPool.append(thread) + return threadPool + + +def get_h_file_md5(rootPath): + h_cu_files = '%s/tools/h_cu_files.log' % rootPath + f = open(h_cu_files) + lines = f.readlines() + for line in lines: + line = line.strip() + os.system('md5sum %s >> %s/tools/h_cu_md5.log' % (line, rootPath)) + + +def insert_pile_to_h_file(rootPath): + h_cu_files = '%s/tools/h_cu_files.log' % rootPath + f = open(h_cu_files) + lines = f.readlines() + for line in lines: + line = line.strip() + func = line.replace('/', '_').replace('.', '_') + os.system('echo "\n#ifndef _PRECISE%s_\n" >> %s' % (func.upper(), line)) + os.system('echo "#define _PRECISE%s_" >> %s' % (func.upper(), line)) + os.system('echo "\n#include \n" >> %s' % line) + os.system( + 'echo "__attribute__((constructor)) static void calledFirst%s()\n{" >> %s' + % (func, line)) + os.system( + 'echo \' printf("precise test map fileeee: %%s\\\\n", __FILE__);\n}\' >> %s' + % line) + os.system('echo "\n#endif" >> %s' % line) + + +def get_h_cu_file(file_path): + rootPath = file_path[0] + dir_path = file_path[1] + filename = file_path[2] + ut = filename.replace('^', '').replace('$', '').replace('.log', '') + os.system( + "cat %s/%s | grep 'precise test map fileeee:'| uniq >> %s/build/ut_map/%s/%s.txt" + % (dir_path, filename, rootPath, ut, ut)) + + +def doFun(file_path): + get_h_cu_file(file_path) + + +def main(rootPath, dir_path): + """ + get useful message + """ + startTime = int(time.time()) + test_h_cu_dict = {} + pool = threadPool(23) + for i in range(pool.__len__()): + pool[i].start() + files = os.listdir(dir_path) + for filename in files: + file_path = [rootPath, dir_path, filename] + taskQueue.put(file_path) + taskQueue.join() + endTime = int(time.time()) + print('analy h/cu file cost Time: %s' % (endTime - startTime)) + + +if __name__ == "__main__": + func = sys.argv[1] + if func == 'get_h_file_md5': + rootPath = sys.argv[2] + get_h_file_md5(rootPath) + elif func == 'insert_pile_to_h_file': + rootPath = sys.argv[2] + insert_pile_to_h_file(rootPath) + elif func == 'analy_h_cu_file': + dir_path = sys.argv[2] + rootPath = sys.argv[3] + main(rootPath, dir_path) diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index b36643a11023cfadb9814b0691613851345659e4..55b82084f6bc5acf5e5a0f267255d6c63b0ff4c4 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -284,19 +284,15 @@ CPU_PARALLEL_JOB = [ 'test_default_scope_funcs', 'test_default_dtype', 'test_debugger', - 'test_dataset_wmt', 'test_dataset_voc', 'test_dataset_uci_housing', - 'test_dataset_movielens', 'test_dataset_imikolov', 'test_dataset_imdb', 'test_dataset_conll05', - 'test_dataset_cifar', 'test_dataloader_dataset', 'test_data_generator', 'test_data_feeder', 'test_data', - 'test_cyclic_cifar_dataset', 'test_cudnn_placement_pass', 'test_crypto', 'test_crf_decoding_op', @@ -335,7 +331,6 @@ CPU_PARALLEL_JOB = [ 'test_broadcast_to_op', 'test_broadcast_shape', 'test_broadcast_error', - 'test_broadcast', 'test_bpr_loss_op', 'test_boxps', 'test_bipartite_match_op', @@ -346,8 +341,6 @@ 
CPU_PARALLEL_JOB = [ 'test_basic_rnn_name', 'test_attention_lstm_op', 'test_analyzer', - 'test_allreduce', - 'test_allgather', 'test_aligned_allocator', 'system_allocator_test', 'stringprintf_test', @@ -428,6 +421,7 @@ CPU_PARALLEL_JOB = [ 'buffered_allocator_test', 'broadcast_op_test', 'bfloat16_test', + 'complex_test', 'beam_search_decode_op_test', 'auto_growth_best_fit_allocator_test', 'assign_op_test', @@ -494,7 +488,6 @@ CPU_PARALLEL_JOB = [ 'test_dist_mnist_ring_allreduce', 'test_fleet_launch_async', 'test_dist_fleet_a_sync_optimizer_geo', - 'test_parallel_dygraph_control_flow', 'test_auto_checkpoint', 'test_fleet_pipeline_meta_optimizer', 'test_dist_fleet_heter_ctr', @@ -516,12 +509,10 @@ CPU_PARALLEL_JOB = [ 'test_dist_fleet_ps2', 'test_dist_fleet_grad_clip', 'test_custom_concat', - 'test_analyzer_transformer_fuse', 'test_analyzer_seq_pool1_fuse_statis', 'test_fc_lstm_fuse_pass_cc', 'test_layer_norm_fuse_pass', 'test_fc_gru_fuse_pass_cc', - 'test_analyzer_save_model', 'test_fleet_ps', 'test_analyzer_multi_model_prediction', 'test_fleet_base_3', @@ -545,7 +536,6 @@ CPU_PARALLEL_JOB = [ 'test_bf16_utils', 'test_analyzer_seq_pool1_compare_determine', 'test_avoid_twice_initialization', - 'test_callback_early_stop', 'test_fleet_distributed_strategy', 'test_launch_coverage', 'test_sgd_op_bf16', @@ -632,16 +622,11 @@ TETRAD_PARALLEL_JOB = [ 'test_analyzer_seq_pool1', 'test_analyzer_ocr', 'test_analyzer_seq_conv1', - 'test_analyzer_small_dam', 'test_analyzer_mobilenet_depthwise_conv', 'test_analyzer_pyramid_dnn', - 'test_analyzer_text_classification', 'test_analyzer_rnn2', - 'test_analyzer_transformer', 'test_analyzer_resnet50', 'test_analyzer_ner', - 'test_analyzer_lac', - 'test_analyzer_transformer_profile', 'test_analyzer_mobilenet_transpose', 'test_analyzer_rnn1', 'test_analyzer_seq_pool1_profile', @@ -664,9 +649,7 @@ TETRAD_PARALLEL_JOB = [ 'test_collective_split_embedding_none_divisible', 'test_collective_wait', 'test_collective_split_row_linear', - 'test_collective_split_col_linear', 'test_collective_split_embedding', - 'test_custom_attrs_jit', 'float16_gpu_test', 'test_leaky_relu_grad_grad_functor', 'test_complex_simplenet', @@ -699,11 +682,8 @@ TWO_PARALLEL_JOB = [ 'test_ema', 'test_nan_inf', 'test_isinstance', - 'test_jit_save_load', 'test_box_clip_op', - 'test_group_norm_op', 'test_seed_op', - 'test_activation_nn_grad', 'test_pool2d_int8_mkldnn_op', 'test_adagrad_op_v2', 'test_nn_functional_hot_op', @@ -734,7 +714,6 @@ TWO_PARALLEL_JOB = [ 'test_lod_reset_op', 'test_install_check', 'test_anchor_generator_op', - 'test_imperative_ptb_rnn', 'test_gather_nd_op', 'test_network_with_dtype', 'test_elementwise_sub_op', @@ -823,7 +802,6 @@ TWO_PARALLEL_JOB = [ 'test_sequence_expand_as', 'test_cos_sim_op', 'test_sequence_enumerate_op', - 'test_cross_entropy2_op', 'test_sequence_concat', 'test_cudnn_lstmcell', 'test_data_norm_op', @@ -928,7 +906,6 @@ TWO_PARALLEL_JOB = [ 'test_crop_tensor_op', 'test_sequence_expand', 'test_sequence_mask', - 'test_conv_nn_grad', 'test_sequence_pool', 'test_conv_elementwise_add2_act_fuse_pass', 'test_sequence_reshape', @@ -955,7 +932,6 @@ TWO_PARALLEL_JOB = [ 'test_adam_op', 'test_bilinear_tensor_product_op', 'test_transpose_mkldnn_op', - 'test_callback_reduce_lr_on_plateau', 'test_cast_op', 'test_scatter_nd_op', 'test_conv2d_transpose_op_depthwise_conv', diff --git a/tools/pyCov_multithreading.py b/tools/pyCov_multithreading.py new file mode 100644 index 0000000000000000000000000000000000000000..2df4ac2ef6b3fb918322d65c2c3b9f60964bb5d6 --- /dev/null +++ 
b/tools/pyCov_multithreading.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import commands +from xml.etree import ElementTree +import re +import time +import queue +import threading +import os +import json +import sys + +taskQueue = queue.Queue() +lock = threading.RLock() + + +def worker(fun): + while True: + temp = taskQueue.get() + fun(temp) + taskQueue.task_done() + + +def threadPool(threadPoolNum): + threadPool = [] + for i in range(threadPoolNum): + thread = threading.Thread(target=worker, args={doFun, }) + thread.daemon = True + threadPool.append(thread) + return threadPool + + +def getPyCovResult(params): + rootPath = params[0] + ut = params[1] + print("ut: %s" % ut) + startTime = int(time.time()) + path = '%s/build/pytest/%s' % (rootPath, ut) + os.system('cd %s && coverage combine `ls python-coverage.data.*`' % path) + os.system('cd %s && pwd && coverage xml -i -o python-coverage.xml' % path) + xml_path = '%s/python-coverage.xml' % path + os.system("python %s/tools/analysisPyXml.py %s %s" % + (rootPath, rootPath, ut)) + endTime = int(time.time()) + print('pyCov Time: %s' % (endTime - startTime)) + + +def doFun(params): + getPyCovResult(params) + + +def main(rootPath): + """ + 1. get gcov file + 2. get gcov file not coverageratio = 0 + """ + path = '%s/build/pytest' % rootPath + dirs = os.listdir(path) + pool = threadPool(23) + for i in range(pool.__len__()): + pool[i].start() + for ut in dirs: + params = [rootPath, ut] + taskQueue.put(params) + taskQueue.join() + + +if __name__ == "__main__": + rootPath = sys.argv[1] + main(rootPath) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 52777cd59ba253b8801bf058899570b4770ca724..a1658e3c2edf790ba817ae3a098a5d660a94a050 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -11,12 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
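The queue-plus-daemon-worker pattern shared by `handle_h_cu_file.py` and `pyCov_multithreading.py` above, reduced to a self-contained toy (the worker body merely stands in for `get_h_cu_file` / `getPyCovResult`):

    import queue
    import threading

    tasks = queue.Queue()

    def worker():
        while True:
            item = tasks.get()
            print('processing', item)    # stand-in for the real per-UT work
            tasks.task_done()

    for _ in range(4):                   # the tools above start 23 workers
        threading.Thread(target=worker, daemon=True).start()

    for i in range(10):
        tasks.put(i)
    tasks.join()                         # returns once every queued item is processed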
+""" +please make sure to run in the tools path +usage: python sample_test.py {cpu or gpu} + {cpu or gpu}: running in cpu version or gpu version + +for example, you can run cpu version python2 testing like this: + + python sampcd_processor.py cpu +""" import os import sys import subprocess import multiprocessing -import math import platform import inspect import json @@ -24,16 +32,7 @@ import argparse import shutil import re import logging -""" -please make sure to run in the tools path -usage: python sample_test.py {cpu or gpu} - {cpu or gpu}: running in cpu version or gpu version - -for example, you can run cpu version python2 testing like this: - - python sampcd_processor.py cpu - -""" +import time logger = logging.getLogger() if logger.handlers: @@ -45,6 +44,7 @@ else: console.setFormatter(logging.Formatter("%(message)s")) RUN_ON_DEVICE = 'cpu' +SAMPLE_CODE_TEST_CAPACITY = set() GPU_ID = 0 methods = [] whl_error = [] @@ -52,6 +52,15 @@ API_DEV_SPEC_FN = 'paddle/fluid/API_DEV.spec' API_PR_SPEC_FN = 'paddle/fluid/API_PR.spec' API_DIFF_SPEC_FN = 'dev_pr_diff_api.spec' SAMPLECODE_TEMPDIR = 'samplecode_temp' +ENV_KEY_CODES_FRONTEND = 'CODES_INSERTED_INTO_FRONTEND' +ENV_KEY_TEST_CAPACITY = 'SAMPLE_CODE_TEST_CAPACITY' +SUMMARY_INFO = { + 'success': [], + 'failed': [], + 'skiptest': [], + 'nocodes': [], + # ... required not-match +} def find_all(srcstr, substr): @@ -75,32 +84,225 @@ def find_all(srcstr, substr): return indices -def check_indent(cdline): +def find_last_future_line_end(cbstr): + """ + find the last `__future__` line. + + Args: + docstr(str): docstring + Return: + index of the line end or None. """ - to check the indent of a given code line + pat = re.compile('__future__.*\n') + lastmo = None + it = re.finditer(pat, cbstr) + while True: + try: + lastmo = next(it) + except StopIteration: + break + if lastmo: + return lastmo.end() + else: + return None - to get the number of starting blank chars, - e.t. blankspaces and \t - \t will be interpreted as 4 single blankspaces, - e.t. '\t'=' ' +def extract_code_blocks_from_docstr(docstr): + """ + extract code-blocks from the given docstring. + + DON'T include the multiline-string definition in code-blocks. + The *Examples* section must be the last. Args: - cdline(str) : a single line of code from the source file + docstr(str): docstring + Return: + code_blocks: A list of code-blocks, indent removed. + element {'name': the code-block's name, 'id': sequence id. 
+ 'codes': codes, 'required': 'gpu'} + """ + code_blocks = [] + + mo = re.search(r"Examples:", docstr) + if mo is None: + return code_blocks + ds_list = docstr[mo.start():].replace("\t", ' ').split("\n") + lastlineindex = len(ds_list) - 1 + + cb_start_pat = re.compile(r"code-block::\s*python") + cb_param_pat = re.compile(r"^\s*:(\w+):\s*(\S*)\s*$") + cb_required_pat = re.compile(r"^\s*#\s*require[s|d]\s*:\s*(\S+)\s*$") + + cb_info = {} + cb_info['cb_started'] = False + cb_info['cb_cur'] = [] + cb_info['cb_cur_indent'] = -1 + cb_info['cb_cur_name'] = None + cb_info['cb_cur_seq_id'] = 0 + cb_info['cb_required'] = None + + def _cb_started(): + # nonlocal cb_started, cb_cur_name, cb_required, cb_cur_seq_id + cb_info['cb_started'] = True + cb_info['cb_cur_seq_id'] += 1 + cb_info['cb_cur_name'] = None + cb_info['cb_required'] = None + + def _append_code_block(): + # nonlocal code_blocks, cb_cur, cb_cur_name, cb_cur_seq_id, cb_required + code_blocks.append({ + 'codes': inspect.cleandoc("\n".join(cb_info['cb_cur'])), + 'name': cb_info['cb_cur_name'], + 'id': cb_info['cb_cur_seq_id'], + 'required': cb_info['cb_required'], + }) + + for lineno, linecont in enumerate(ds_list): + if re.search(cb_start_pat, linecont): + if not cb_info['cb_started']: + _cb_started() + continue + else: + # cur block end + if len(cb_info['cb_cur']): + _append_code_block() + _cb_started() # another block started + cb_info['cb_cur_indent'] = -1 + cb_info['cb_cur'] = [] + else: + if cb_info['cb_started']: + # handle the code-block directive's options + mo_p = cb_param_pat.match(linecont) + if mo_p: + if mo_p.group(1) == 'name': + cb_info['cb_cur_name'] = mo_p.group(2) + continue + # read the required directive + mo_r = cb_required_pat.match(linecont) + if mo_r: + cb_info['cb_required'] = mo_r.group(1) + # docstring end + if lineno == lastlineindex: + mo = re.search(r"\S", linecont) + if mo is not None and cb_info['cb_cur_indent'] <= mo.start( + ): + cb_info['cb_cur'].append(linecont) + if len(cb_info['cb_cur']): + _append_code_block() + break + # check indent for cur block start and end. + mo = re.search(r"\S", linecont) + if mo is None: + continue + if cb_info['cb_cur_indent'] < 0: + # find the first non empty line + cb_info['cb_cur_indent'] = mo.start() + cb_info['cb_cur'].append(linecont) + else: + if cb_info['cb_cur_indent'] <= mo.start(): + cb_info['cb_cur'].append(linecont) + else: + if linecont[mo.start()] == '#': + continue + else: + # block end + if len(cb_info['cb_cur']): + _append_code_block() + cb_info['cb_started'] = False + cb_info['cb_cur_indent'] = -1 + cb_info['cb_cur'] = [] + return code_blocks + + +def get_test_capacity(): + """ + collect capacities and set to SAMPLE_CODE_TEST_CAPACITY + """ + global SAMPLE_CODE_TEST_CAPACITY # write + global ENV_KEY_TEST_CAPACITY, RUN_ON_DEVICE # readonly + if ENV_KEY_TEST_CAPACITY in os.environ: + for r in os.environ[ENV_KEY_TEST_CAPACITY].split(','): + rr = r.strip().lower() + if r: + SAMPLE_CODE_TEST_CAPACITY.add(rr) + if 'cpu' not in SAMPLE_CODE_TEST_CAPACITY: + SAMPLE_CODE_TEST_CAPACITY.add('cpu') - Returns: - int : the indent of the number of interpreted - blankspaces + if RUN_ON_DEVICE: + SAMPLE_CODE_TEST_CAPACITY.add(RUN_ON_DEVICE) + + +def is_required_match(requirestr, cbtitle='not-specified'): """ - indent = 0 - for c in cdline: - if c == '\t': - indent += 4 - elif c == ' ': - indent += 1 - if c != ' ' and c != '\t': - break - return indent + search the required instruction in the code-block, and check it match the current running environment. 
+ + environment values of equipped: cpu, gpu, xpu, distributed, skip + the 'skip' is the special flag to skip the test, so is_required_match will return False directly. + + Args: + requirestr(str): the required string. + cbtitle(str): the title of the code-block. + returns: + True - yes, matched + False - not match + None - skipped # trick + """ + global SAMPLE_CODE_TEST_CAPACITY # readonly + requires = set(['cpu']) + if requirestr: + for r in requirestr.split(','): + rr = r.strip().lower() + if rr: + requires.add(rr) + if 'skip' in requires or 'skiptest' in requires: + logger.info('%s: skipped', cbtitle) + return None + + if all([ + k in SAMPLE_CODE_TEST_CAPACITY for k in requires + if k not in ['skip', 'skiptest'] + ]): + return True + + logger.info('%s: the equipments [%s] not match the required [%s].', cbtitle, + ','.join(SAMPLE_CODE_TEST_CAPACITY), ','.join(requires)) + return False + + +def insert_codes_into_codeblock(codeblock, apiname='not-specified'): + """ + insert some codes in the frontend and backend into the code-block. + """ + global ENV_KEY_CODES_FRONTEND, GPU_ID, RUN_ON_DEVICE # readonly + inserted_codes_f = '' + inserted_codes_b = '' + if ENV_KEY_CODES_FRONTEND in os.environ and os.environ[ + ENV_KEY_CODES_FRONTEND]: + inserted_codes_f = os.environ[ENV_KEY_CODES_FRONTEND] + else: + cpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + gpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format( + GPU_ID) + if 'required' in codeblock: + if codeblock['required'] is None or codeblock['required'] == 'cpu': + inserted_codes_f = cpu_str + elif codeblock['required'] == 'gpu': + inserted_codes_f = gpu_str + else: + if RUN_ON_DEVICE == "cpu": + inserted_codes_f = cpu_str + elif RUN_ON_DEVICE == "gpu": + inserted_codes_f = gpu_str + inserted_codes_b = '\nprint("{}\'s sample code (name:{}, id:{}) is executed successfully!")'.format( + apiname, codeblock['name'], codeblock['id']) + + cb = codeblock['codes'] + last_future_line_end = find_last_future_line_end(cb) + if last_future_line_end: + return cb[:last_future_line_end] + inserted_codes_f + cb[ + last_future_line_end:] + inserted_codes_b + else: + return inserted_codes_f + cb + inserted_codes_b def sampcd_extract_to_file(srccom, name, htype="def", hname=""): @@ -117,122 +319,111 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): Returns: sample_code_filenames(list of str) """ - global GPU_ID, RUN_ON_DEVICE, SAMPLECODE_TEMPDIR - CODE_BLOCK_INTERDUCTORY = "code-block:: python" + global GPU_ID, RUN_ON_DEVICE, SAMPLECODE_TEMPDIR # readonly + global SUMMARY_INFO # update - sampcd_begins = find_all(srccom, CODE_BLOCK_INTERDUCTORY) - if len(sampcd_begins) == 0: + codeblocks = extract_code_blocks_from_docstr(srccom) + if len(codeblocks) == 0: + SUMMARY_INFO['nocodes'].append(name) # detect sample codes using >>> to format and consider this situation as wrong - print(htype, " name:", hname) - print("-----------------------") + logger.info(htype + " name:" + name) + logger.info("-----------------------") if srccom.find("Examples:") != -1: - print("----example code check----\n") + logger.info("----example code check----") if srccom.find(">>>") != -1: - print( - "Deprecated sample code style:\n\n Examples:\n\n >>>codeline\n >>>codeline\n\n\n ", - "Please use '.. code-block:: python' to ", - "format sample code.\n") + logger.warning(r"""Deprecated sample code style: + Examples: + >>>codeline + >>>codeline + +Please use '.. 
code-block:: python' to format the sample code.""") return [] else: - print("Error: No sample code!\n") + logger.warning("Error: No sample code!") return [] + sample_code_filenames = [] - for y in range(1, len(sampcd_begins) + 1): - sampcd_begin = sampcd_begins[y - 1] - sampcd = srccom[sampcd_begin + len(CODE_BLOCK_INTERDUCTORY) + 1:] - sampcd = sampcd.split("\n") - # remove starting empty lines - while sampcd[0].replace(' ', '').replace('\t', '') == '': - sampcd.pop(0) - - # the minimum indent, which is the indent of the first - # non-empty line - min_indent = check_indent(sampcd[0]) - sampcd_to_write = [] - for i in range(0, len(sampcd)): - cdline = sampcd[i] - # handle empty lines or those only with spaces/tabs - if cdline.strip() == '': - continue - this_indent = check_indent(cdline) - if this_indent < min_indent: - break - else: - cdline = cdline.replace('\t', ' ') - sampcd_to_write.append(cdline[min_indent:]) - - sampcd = '\n'.join(sampcd_to_write) - if RUN_ON_DEVICE == "cpu": - sampcd = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + sampcd - if RUN_ON_DEVICE == "gpu": - sampcd = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format( - GPU_ID) + sampcd - sampcd += '\nprint(' + '\"' + name + ' sample code is executed successfully!\")' - - tfname = os.path.join(SAMPLECODE_TEMPDIR, '{}_example{}'.format( - name, '.py' if len(sampcd_begins) == 1 else '_{}.py'.format(y))) - with open(tfname, 'w') as tempf: - tempf.write(sampcd) - sample_code_filenames.append(tfname) + for y, cb in enumerate(codeblocks): + matched = is_required_match(cb['required'], name) + # matched has three states: + # True - please execute it; + # None - no sample code found; + # False - it need other special equipment or environment. + # so, the following conditional statements are intentionally arranged. + if matched == True: + tfname = os.path.join(SAMPLECODE_TEMPDIR, '{}_example{}'.format( + name, '.py' + if len(codeblocks) == 1 else '_{}.py'.format(y + 1))) + with open(tfname, 'w') as tempf: + sampcd = insert_codes_into_codeblock(cb, name) + tempf.write(sampcd) + sample_code_filenames.append(tfname) + elif matched is None: + logger.info('{}\' code block (name:{}, id:{}) is skipped.'.format( + name, cb['name'], cb['id'])) + SUMMARY_INFO['skiptest'].append("{}-{}".format(name, cb['id'])) + elif matched == False: + logger.info( + '{}\' code block (name:{}, id:{}) required({}) not match capacity({}).'. + format(name, cb['name'], cb['id'], cb['required'], + SAMPLE_CODE_TEST_CAPACITY)) + if cb['required'] not in SUMMARY_INFO: + SUMMARY_INFO[cb['required']] = [] + SUMMARY_INFO[cb['required']].append("{}-{}".format(name, cb['id'])) + return sample_code_filenames def execute_samplecode(tfname): """ - Execute a sample-code test. + Execute a sample-code test Args: - tfname: the filename of the samplecode. + tfname: the filename of the sample code Returns: result: success or not tfname: same as the input argument - msg: the stdout output of the samplecode executing. + msg: the stdout output of the sample code executing + time: time consumed by sample code """ result = True msg = None if platform.python_version()[0] in ["2", "3"]: cmd = [sys.executable, tfname] else: - print("Error: fail to parse python version!") + logger.error("Error: fail to parse python version!") result = False exit(1) - # check required envisonment - with open(tfname, 'r') as f: - for line in f.readlines(): - if re.match(r'#\s*required\s*:\s*(distributed|gpu|skip)', line): - result = True - return result, tfname, '{} is skipped. 
cause: {}'.format(tfname, - line) - - logging.info('running %s', tfname) - print("\n----example code check----") - print("executing sample code .....", tfname) + logger.info("----example code check----") + logger.info("executing sample code: %s", tfname) + start_time = time.time() subprc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = subprc.communicate() msg = "".join(output.decode(encoding='utf-8')) err = "".join(error.decode(encoding='utf-8')) + end_time = time.time() if subprc.returncode != 0: - print("Sample code error found in ", tfname, ":") - print("-----------------------") - print(open(tfname).read()) - print("-----------------------") - print("subprocess return code: ", str(subprc.returncode)) - print("Error Raised from Sample Code ", tfname, " :") - print(err) - print(msg) - print("----example code check failed----\n") - logging.warning('%s error: %s', tfname, err) - logging.warning('%s msg: %s', tfname, msg) + with open(tfname, 'r') as f: + logger.warning("""Sample code error found in %s: +----------------------- +%s +----------------------- +subprocess return code: %d +Error Raised from Sample Code: +stderr: %s +stdout: %s +""", tfname, f.read(), subprc.returncode, err, msg) + logger.info("----example code check failed----") result = False else: - print("----example code check success----\n") + logger.info("----example code check success----") # msg is the returned code execution report - return result, tfname, msg + return result, tfname, msg, end_time - start_time def get_filenames(): @@ -317,35 +508,6 @@ def get_incrementapi(): f.write('\n') -def get_wlist(fn="wlist.json"): - ''' - this function will get the white list of API. - - Returns: - - wlist: a list of API that should not trigger the example check . 
- - ''' - wlist = [] - wlist_file = [] - # only white on CPU - gpu_not_white = [] - with open(fn, 'r') as load_f: - load_dict = json.load(load_f) - for key in load_dict: - if key == 'wlist_dir': - for item in load_dict[key]: - wlist_file.append(item["name"]) - elif key == "gpu_not_white": - gpu_not_white = load_dict[key] - elif key == "wlist_api": - for item in load_dict[key]: - wlist.append(item["name"]) - else: - wlist = wlist + load_dict[key] - return wlist, wlist_file, gpu_not_white - - arguments = [ # flags, dest, type, default, help ['--gpu_id', 'gpu_id', int, 0, 'GPU device id to use [0]'], @@ -391,18 +553,15 @@ if __name__ == '__main__': )) logger.addHandler(logfHandler) - wlist, wlist_file, gpu_not_white = get_wlist() - if args.mode == "gpu": GPU_ID = args.gpu_id logger.info("using GPU_ID %d", GPU_ID) - for _gnw in gpu_not_white: - wlist.remove(_gnw) elif args.mode != "cpu": logger.error("Unrecognized argument:%s, 'cpu' or 'gpu' is desired.", args.mode) sys.exit("Invalid arguments") RUN_ON_DEVICE = args.mode + get_test_capacity() logger.info("API check -- Example Code") logger.info("sample_test running under python %s", platform.python_version()) @@ -449,19 +608,50 @@ if __name__ == '__main__': if not temp[0]: logger.info("In addition, mistakes found in sample codes: %s", temp[1]) - logger.info("error_methods: %s", str(temp[2])) logger.info("----------------------------------------------------") exit(1) else: - has_error = False + timeovered_test = {} for temp in result: if not temp[0]: logger.info("In addition, mistakes found in sample codes: %s", temp[1]) - logger.info("error_methods: %s", str(temp[2])) - has_error = True - if has_error: - logger.info("Mistakes found in sample codes.") - logger.info("Please check sample codes.") + SUMMARY_INFO['failed'].append(temp[1]) + else: + SUMMARY_INFO['success'].append(temp[1]) + if temp[3] > 10: + timeovered_test[temp[1]] = temp[3] + + if len(timeovered_test): + logger.info("%d sample codes ran time over 10s", + len(timeovered_test)) + if args.debug: + for k, v in timeovered_test.items(): + logger.info('{} - {}s'.format(k, v)) + if len(SUMMARY_INFO['success']): + logger.info("%d sample codes ran success", + len(SUMMARY_INFO['success'])) + for k, v in SUMMARY_INFO.items(): + if k not in ['success', 'failed', 'skiptest', 'nocodes']: + logger.info("%d sample codes required not match for %s", + len(v), k) + if len(SUMMARY_INFO['skiptest']): + logger.info("%d sample codes skipped", + len(SUMMARY_INFO['skiptest'])) + if args.debug: + logger.info('\n'.join(SUMMARY_INFO['skiptest'])) + if len(SUMMARY_INFO['nocodes']): + logger.info("%d apis don't have sample codes", + len(SUMMARY_INFO['nocodes'])) + if args.debug: + logger.info('\n'.join(SUMMARY_INFO['nocodes'])) + if len(SUMMARY_INFO['failed']): + logger.info("%d sample codes ran failed", + len(SUMMARY_INFO['failed'])) + logger.info('\n'.join(SUMMARY_INFO['failed'])) + logger.info( + "Mistakes found in sample codes. Please recheck the sample codes." 
+ ) exit(1) + logger.info("Sample code check is successful!") diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 15bcae826064d981eee0972ea10ed1294cd9d5c5..2c50c4bf9f6207d3eea5f917d68bdd218cb07aeb 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -447,6 +447,8 @@ STATIC_MODE_TESTING_LIST = [ 'test_sample_logits_op', 'test_save_model_without_var', 'test_scale_op', + 'test_scale_mkldnn_op', + 'test_scale_bf16_mkldnn_op', 'test_scaled_dot_product_attention', 'test_scatter_nd_op', 'test_seed_op', @@ -587,6 +589,7 @@ STATIC_MODE_TESTING_LIST = [ 'test_matmul_op_with_head', 'test_var_conv_2d', 'test_batch_norm_mkldnn_op', + 'test_cast_mkldnn_op', 'test_concat_int8_mkldnn_op', 'test_concat_bf16_mkldnn_op', 'test_concat_mkldnn_op', diff --git a/tools/test_model_benchmark.sh b/tools/test_model_benchmark.sh index 5ec71ef8c114d35c4d41832221967298bc0c217f..98066d7beeaa7777d4ba6be90bba512264185ad0 100644 --- a/tools/test_model_benchmark.sh +++ b/tools/test_model_benchmark.sh @@ -24,11 +24,13 @@ function check_whl { mkdir -p /tmp/pr && mkdir -p /tmp/develop unzip -q build/python/dist/*.whl -d /tmp/pr + rm -f build/python/dist/*.whl && rm -f build/python/build/.timestamp git checkout . git checkout -b develop_base_pr upstream/$BRANCH + bash -x paddle/scripts/paddle_build.sh build + [ $? -ne 0 ] && echo "install paddle failed." && exit 1 cd build - make -j `nproc` unzip -q python/dist/*.whl -d /tmp/develop sed -i '/version.py/d' /tmp/pr/*/RECORD @@ -36,7 +38,10 @@ function check_whl { diff_whl=`diff /tmp/pr/*/RECORD /tmp/develop/*/RECORD|wc -l` if [ ${diff_whl} -eq 0 ];then echo "paddle whl does not diff in PR-CI-Model-benchmark, so skip this ci" + echo "ipipe_log_param_isSkipTest_model_benchmark: 1" exit 0 + else + echo "ipipe_log_param_isSkipTest_model_benchmark: 0" fi } diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py index 7836728247f50c50e90a4aa15fb78cd6f0a2efa8..81710dae16764229ddeb00bd3e66480c2dbef7fe 100644 --- a/tools/test_sampcd_processor.py +++ b/tools/test_sampcd_processor.py @@ -20,15 +20,18 @@ import tempfile import shutil import sys import importlib +import re +import sampcd_processor from sampcd_processor import find_all -from sampcd_processor import check_indent from sampcd_processor import get_api_md5 from sampcd_processor import get_incrementapi -from sampcd_processor import get_wlist from sampcd_processor import sampcd_extract_to_file +from sampcd_processor import extract_code_blocks_from_docstr from sampcd_processor import execute_samplecode - -SAMPLECODE_TEMP_DIR = 'samplecode_temp' +from sampcd_processor import find_last_future_line_end +from sampcd_processor import insert_codes_into_codeblock +from sampcd_processor import get_test_capacity +from sampcd_processor import is_required_match class Test_find_all(unittest.TestCase): @@ -43,27 +46,246 @@ class Test_find_all(unittest.TestCase): find_all(' hello, world; hello paddle!', 'hello')) -class Test_check_indent(unittest.TestCase): - def test_no_indent(self): - self.assertEqual(0, check_indent('hello paddle')) +class Test_find_last_future_line_end(unittest.TestCase): + def test_no_instant(self): + samplecodes = """ + print(10//3) + """ + self.assertIsNone(find_last_future_line_end(samplecodes)) + + def test_1_instant(self): + samplecodes = """ + from __future__ import print_function + + print(10//3) + """ + mo = re.search("print_function\n", samplecodes) + self.assertIsNotNone(mo) + self.assertGreaterEqual( + 
find_last_future_line_end(samplecodes), mo.end()) + + def test_2_instant(self): + samplecodes = """ + from __future__ import print_function + from __future__ import division + + print(10//3) + """ + mo = re.search("division\n", samplecodes) + self.assertIsNotNone(mo) + self.assertGreaterEqual( + find_last_future_line_end(samplecodes), mo.end()) + + +class Test_extract_code_blocks_from_docstr(unittest.TestCase): + def test_no_samplecode(self): + docstr = """ + placeholder + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual([], codeblocks) + + def test_codeblock_before_examples_is_ignored(self): + docstr = """ + .. code-block:: python + + print(1+1) + Examples: + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual(codeblocks, []) + + def test_1_samplecode(self): + docstr = """ + Examples: + .. code-block:: python + + print(1+1) + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual(codeblocks, [{ + 'codes': """print(1+1)""", + 'name': None, + 'id': 1, + 'required': None, + }]) + + def test_2_samplecodes(self): + docstr = """ + placeholder + Examples: + .. code-block:: python + + print(1/0) + + .. code-block:: python + :name: one_plus_one + :linenos: + + # required: gpu + print(1+1) + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual(codeblocks, [{ + 'codes': """print(1/0)""", + 'name': None, + 'id': 1, + 'required': None, + }, { + 'codes': """# required: gpu +print(1+1)""", + 'name': 'one_plus_one', + 'id': 2, + 'required': 'gpu', + }]) + + +class Test_insert_codes_into_codeblock(unittest.TestCase): + def test_required_None(self): + codeblock = { + 'codes': """print(1/0)""", + 'name': None, + 'id': 1, + 'required': None, + } + self.assertEqual(""" +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "" +print(1/0) +print("not-specified's sample code (name:None, id:1) is executed successfully!")""", + insert_codes_into_codeblock(codeblock)) + + def test_required_gpu(self): + codeblock = { + 'codes': """# required: gpu +print(1+1)""", + 'name': None, + 'id': 1, + 'required': 'gpu', + } + self.assertEqual(""" +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# required: gpu +print(1+1) +print("not-specified's sample code (name:None, id:1) is executed successfully!")""", + insert_codes_into_codeblock(codeblock)) + + def test_from_future(self): + codeblock = { + 'codes': """ +from __future__ import print_function +from __future__ import division +print(10//3)""", + 'name': 'future', + 'id': 1, + 'required': None, + } + self.assertEqual(""" +from __future__ import print_function +from __future__ import division + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "" +print(10//3) +print("not-specified's sample code (name:future, id:1) is executed successfully!")""", + insert_codes_into_codeblock(codeblock)) + + +def clear_capacity(): + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY = set() + sampcd_processor.RUN_ON_DEVICE = 'cpu' + if sampcd_processor.ENV_KEY_TEST_CAPACITY in os.environ: + del os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] - def test_indent_4_spaces(self): - self.assertEqual(4, check_indent(' hello paddle')) - def test_indent_1_tab(self): - self.assertEqual(4, check_indent("\thello paddle")) +class Test_get_test_capacity(unittest.TestCase): + def setUp(self): + clear_capacity() + get_test_capacity() + + def tearDown(self): + clear_capacity() + get_test_capacity() + + def test_NoEnvVar(self): + clear_capacity() + get_test_capacity() + 
self.assertCountEqual(['cpu', ], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + def test_NoEnvVar_RUN_ON_DEVICE_gpu(self): + clear_capacity() + sampcd_processor.RUN_ON_DEVICE = 'gpu' + get_test_capacity() + self.assertCountEqual(['cpu', 'gpu'], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + def test_EnvVar_gpu(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu' + get_test_capacity() + self.assertCountEqual(['cpu', 'gpu'], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + def test_EnvVar_gpu_and_distributed(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu,distributed' + get_test_capacity() + self.assertCountEqual(['cpu', 'gpu', 'distributed'], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + +class Test_is_required_match(unittest.TestCase): + def setUp(self): + clear_capacity() + + def tearDown(self): + clear_capacity() + get_test_capacity() + + def test_alldefault(self): + clear_capacity() + get_test_capacity() + self.assertTrue(is_required_match('')) + self.assertTrue(is_required_match(None)) + self.assertTrue(is_required_match('cpu')) + self.assertFalse(is_required_match('gpu')) + self.assertIsNone(is_required_match('skiptest')) + self.assertIsNone(is_required_match('skip')) + self.assertIsNone(is_required_match('cpu,skiptest')) + + def test_gpu_equipped(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu' + get_test_capacity() + self.assertTrue(is_required_match('cpu')) + self.assertTrue(is_required_match('gpu')) + self.assertTrue(is_required_match('gpu,cpu')) + self.assertIsNone(is_required_match('skiptest')) + self.assertFalse(is_required_match('distributed')) + + def test_gpu_distributed_equipped(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu,distributed' + get_test_capacity() + self.assertTrue(is_required_match('cpu')) + self.assertTrue(is_required_match('gpu')) + self.assertTrue(is_required_match('distributed')) + self.assertFalse(is_required_match('xpu')) + self.assertIsNone(is_required_match('skiptest')) class Test_execute_samplecode(unittest.TestCase): def setUp(self): - if not os.path.exists(SAMPLECODE_TEMP_DIR): - os.mkdir(SAMPLECODE_TEMP_DIR) - self.successSampleCodeFile = os.path.join(SAMPLECODE_TEMP_DIR, - 'samplecode_success.py') + if not os.path.exists(sampcd_processor.SAMPLECODE_TEMPDIR): + os.mkdir(sampcd_processor.SAMPLECODE_TEMPDIR) + self.successSampleCodeFile = os.path.join( + sampcd_processor.SAMPLECODE_TEMPDIR, 'samplecode_success.py') with open(self.successSampleCodeFile, 'w') as f: f.write('print(1+1)') - self.failedSampleCodeFile = os.path.join(SAMPLECODE_TEMP_DIR, - 'samplecode_failed.py') + self.failedSampleCodeFile = os.path.join( + sampcd_processor.SAMPLECODE_TEMPDIR, 'samplecode_failed.py') with open(self.failedSampleCodeFile, 'w') as f: f.write('print(1/0)') @@ -72,37 +294,41 @@ class Test_execute_samplecode(unittest.TestCase): os.remove(self.failedSampleCodeFile) def test_run_success(self): - result, tfname, msg = execute_samplecode(self.successSampleCodeFile) + result, tfname, msg, exec_time = execute_samplecode( + self.successSampleCodeFile) self.assertTrue(result) self.assertEqual(self.successSampleCodeFile, tfname) self.assertIsNotNone(msg) self.assertLess(msg.find('skipped'), 0) + self.assertLess(exec_time, 10) def test_run_failed(self): - result, tfname, msg = execute_samplecode(self.failedSampleCodeFile) + result, tfname, msg, exec_time = execute_samplecode( + self.failedSampleCodeFile) 
self.assertFalse(result) self.assertEqual(self.failedSampleCodeFile, tfname) self.assertIsNotNone(msg) self.assertLess(msg.find('skipped'), 0) + self.assertLess(exec_time, 10) - def test_testcases_skipped(self): - ... - tfname = os.path.join(SAMPLECODE_TEMP_DIR, 'samplecode_skipped.py') - with open(tfname, 'w') as f: - f.write("# required: distributed\nprint(1/0)") - result, _, msg = execute_samplecode(tfname) - self.assertTrue(result) - self.assertGreaterEqual(msg.find('skipped'), 0) - os.remove(tfname) + +def clear_summary_info(): + for k in sampcd_processor.SUMMARY_INFO.keys(): + sampcd_processor.SUMMARY_INFO[k].clear() class Test_sampcd_extract_to_file(unittest.TestCase): def setUp(self): - if not os.path.exists(SAMPLECODE_TEMP_DIR): - os.mkdir(SAMPLECODE_TEMP_DIR) + if not os.path.exists(sampcd_processor.SAMPLECODE_TEMPDIR): + os.mkdir(sampcd_processor.SAMPLECODE_TEMPDIR) + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu,distributed' + get_test_capacity() def tearDown(self): - shutil.rmtree(SAMPLECODE_TEMP_DIR) + shutil.rmtree(sampcd_processor.SAMPLECODE_TEMPDIR) + clear_capacity() + get_test_capacity() def test_1_samplecode(self): comments = """ @@ -113,9 +339,10 @@ class Test_sampcd_extract_to_file(unittest.TestCase): """ funcname = 'one_plus_one' sample_code_filenames = sampcd_extract_to_file(comments, funcname) - self.assertCountEqual( - [os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example.py')], - sample_code_filenames) + self.assertCountEqual([ + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example.py') + ], sample_code_filenames) def test_no_samplecode(self): comments = """ @@ -140,10 +367,64 @@ class Test_sampcd_extract_to_file(unittest.TestCase): funcname = 'one_plus_one' sample_code_filenames = sampcd_extract_to_file(comments, funcname) self.assertCountEqual([ - os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example_1.py'), - os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example_2.py') + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example_1.py'), + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example_2.py') ], sample_code_filenames) + def test_2_samplecodes_has_skipped(self): + comments = """ + placeholder + Examples: + .. code-block:: python + + # required: skiptest + print(1/0) + + .. code-block:: python + + print(1+1) + + .. code-block:: python + + # required: gpu + print(1//1) + + .. code-block:: python + + # required: xpu + print(1//1) + + .. code-block:: python + + # required: distributed + print(1//1) + + .. 
code-block:: python + + # required: gpu + print(1//1) + """ + funcname = 'one_plus_one' + clear_summary_info() + clear_capacity() + get_test_capacity() + + sample_code_filenames = sampcd_extract_to_file(comments, funcname) + self.assertCountEqual([ + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example_2.py') + ], sample_code_filenames) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['skiptest'], + [funcname + '-1']) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['gpu'], + [funcname + '-3', funcname + '-6']) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['xpu'], + [funcname + '-4']) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['distributed'], + [funcname + '-5']) + class Test_get_api_md5(unittest.TestCase): def setUp(self): @@ -208,55 +489,6 @@ class Test_get_incrementapi(unittest.TestCase): ], lines) -class Test_get_wlist(unittest.TestCase): - def setUp(self): - self.tmpDir = tempfile.mkdtemp() - self.wlist_filename = os.path.join(self.tmpDir, 'wlist.json') - with open(self.wlist_filename, 'w') as f: - f.write(r''' -{ - "wlist_dir":[ - { - "name":"../python/paddle/fluid/contrib", - "annotation":"" - }, - { - "name":"../python/paddle/verison.py", - "annotation":"" - } - ], - "wlist_api":[ - { - "name":"xxxxx", - "annotation":"not a real api, just for example" - } - ], - "wlist_temp_api":[ - "to_tensor", - "save_persistables@dygraph/checkpoint.py" - ], - "gpu_not_white":[ - "deformable_conv" - ] -} -''') - - def tearDown(self): - os.remove(self.wlist_filename) - shutil.rmtree(self.tmpDir) - - def test_get_wlist(self): - wlist, wlist_file, gpu_not_white = get_wlist(self.wlist_filename) - self.assertCountEqual( - ["xxxxx", "to_tensor", - "save_persistables@dygraph/checkpoint.py"], wlist) - self.assertCountEqual([ - "../python/paddle/fluid/contrib", - "../python/paddle/verison.py", - ], wlist_file) - self.assertCountEqual(["deformable_conv"], gpu_not_white) - - # https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/ops.py # why? unabled to use the ast module. emmmmm diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat index 4a61a99c34fa24186e51317e4e1432e64f953d8b..603c9911a44f990e3aaaaec37b2d93ddd2c613d3 100644 --- a/tools/windows/build_compile_environment.bat +++ b/tools/windows/build_compile_environment.bat @@ -132,7 +132,7 @@ goto :eof :vs echo ">>>>>>>> step [4/7]: Visual Studio 2017 " cmd /C "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" > nul 2> nul || call :install_visual_studio -goto :cuda10 +goto :cuda :install_visual_studio echo There is not Visual Studio in this PC, will install VS2017. 
@@ -153,7 +153,7 @@ goto :eof :: ===== end step 4: Visual Studio 2017 ===== :: ===== start step 5: CUDA 11 ===== -:cuda10 +:cuda echo ">>>>>>>> step [5/7]: CUDA 11.2" cmd /C nvcc --version 2> nul | findstr /C:"11.2" > nul 2> nul || call :install_cuda goto java-jre diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index a89dcb61fb7e327cc362c7ffa28eaca14a81112f..68d7ef336edba6ee805eb83753d8cd0d0ac383f6 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -69,6 +69,7 @@ disable_wingpu_test="^test_model$|\ ^test_py_reader_pin_memory$|\ ^test_py_reader_push_pop$|\ ^test_reader_reset$|\ +^test_imperative_se_resnext$|\ ^test_sync_batch_norm_op$|\ ^test_imperative_static_runner_while$|\ ^test_dataloader_keep_order$|\ @@ -345,6 +346,7 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then if [ -f "$PADDLE_ROOT/added_ut" ];then added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$ ctest -R "(${added_uts})" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$? + rm -f $PADDLE_ROOT/added_ut if [ "$added_ut_error" != 0 ];then echo "========================================" echo "Added UT should pass three additional executions" diff --git a/tools/wlist.json b/tools/wlist.json deleted file mode 100644 index 5a83a9ee47004adcfbc41129a68b8734f71fd6cd..0000000000000000000000000000000000000000 --- a/tools/wlist.json +++ /dev/null @@ -1,505 +0,0 @@ -{ - "wlist_dir":[ - { - "name":"../python/paddle/fluid/contrib", - "annotation":"" - }, - { - "name":"../python/paddle/verison.py", - "annotation":"" - }, - { - "name":"../python/paddle/fluid/core_avx.py", - "annotation":"" - }, - { - "name":"../python/paddle/distributed", - "annotation":"" - } - ], - "wlist_api":[ - { - "name":"xxxxx", - "annotation":"not a real api, just for example" - }, - { - "name":"squeeze_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"unsqueeze_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"reshape_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"flatten_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"scatter_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"elu_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"relu_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"softmax_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"tanh_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"ceil_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"floor_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"exp_", - "annotation":"Inplace APIs don't need sample code. 
There is a special document introducing Inplace strategy" - }, - { - "name":"reciprocal_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"round_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"sqrt_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"rsqrt_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"clip_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"scale_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"subtract_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - }, - { - "name":"add_", - "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy" - } - ], - "wlist_temp_api":[ - "to_tensor", - "LRScheduler", - "ReduceOnPlateau", - "append_LARS", - "BuildStrategy.debug_graphviz_path", - "BuildStrategy.enable_sequential_execution", - "BuildStrategy.fuse_elewise_add_act_ops", - "BuildStrategy.fuse_relu_depthwise_conv", - "BuildStrategy.gradient_scale_strategy", - "BuildStrategy.reduce_strategy", - "BuildStrategy.remove_unnecessary_lock", - "BuildStrategy.sync_batch_norm", - "DynamicRNN.step_input", - "DynamicRNN.static_input", - "DynamicRNN.block", - "DynamicRNN.update_memory", - "DynamicRNN.output", - "transpiler.DistributeTranspilerConfig", - "transpiler.DistributeTranspilerConfig.slice_var_up", - "transpiler.DistributeTranspilerConfig.split_method", - "transpiler.DistributeTranspilerConfig.min_block_size", - "DistributeTranspilerConfig.slice_var_up", - "DistributeTranspilerConfig.split_method", - "ModelAverage.apply", - "ModelAverage.restore", - "DistributeTranspilerConfig", - "DistributeTranspilerConfig.min_block_size", - "ExecutionStrategy.allow_op_delay", - "load", - "Accuracy.update", - "ChunkEvaluator.update", - "ExecutionStrategy.num_iteration_per_drop_scope", - "ExecutionStrategy.num_threads", - "CompiledProgram._with_inference_optimize", - "CompositeMetric.add_metric", - "CompositeMetric.update", - "CompositeMetric.eval", - "DetectionMAP.get_map_var", - "MetricBase", - "MetricBase.reset", - "MetricBase.get_config", - "MetricBase.update", - "MetricBase.eval", - "Accuracy.eval", - "Auc.update", - "Auc.eval", - "EditDistance.update", - "EditDistance.eval", - "ExponentialMovingAverage.apply", - "ExponentialMovingAverage.restore", - "ExponentialMovingAverage.update", - "StaticRNN.step", - "StaticRNN.step_input", - "StaticRNN.step_output", - "StaticRNN.update_memory", - "DetectionMAP.reset", - "StaticRNN.output", - "cuda_places", - "CUDAPinnedPlace", - "CUDAPlace", - "Program.parse_from_string", - "Compressor", - "Compressor.config", - "Compressor.run", - "HDFSClient.upload", - "HDFSClient.download", - "HDFSClient.is_exist", - "HDFSClient.is_dir", - "HDFSClient.delete", - "HDFSClient.rename", - "HDFSClient.makedirs", - "HDFSClient.ls", - "HDFSClient.lsr", - "multi_download", - "multi_upload", - "TrainingDecoder.block", - "QuantizeTranspiler.training_transpile", - "QuantizeTranspiler.freeze_program", - "AutoMixedPrecisionLists", - "Uniform.sample", - "Uniform.log_prob", - 
"Uniform.entropy", - "Categorical.kl_divergence", - "Categorical.entropy", - "MultivariateNormalDiag.entropy", - "MultivariateNormalDiag.kl_divergence", - "RNNCell", - "RNNCell.call", - "RNNCell.get_initial_states", - "GRUCell.call", - "LSTMCell.call", - "Decoder", - "Decoder.initialize", - "Decoder.step", - "Decoder.finalize", - "fused_elemwise_activation", - "search_pyramid_hash", - "convert_dist_to_sparse_program", - "load_persistables_for_increment", - "load_persistables_for_inference", - "xmap_readers", - "Metric.reset", - "Metric.update", - "Metric.accumulate", - "Metric.name", - "Metric.compute", - "Accuracy.reset", - "Accuracy.update", - "Accuracy.accumulate", - "Accuracy.name", - "Accuracy.compute", - "Precision.reset", - "Precision.update", - "Precision.accumulate", - "Precision.name", - "Precision.compute", - "Recall.reset", - "Recall.update", - "Recall.accumulate", - "Recall.name", - "Recall.compute", - "Auc.reset", - "Auc.update", - "Auc.accumulate", - "Auc.name", - "Auc.compute", - "Callback.set_params", - "Callback.on_train_begin", - "Callback.on_train_end", - "Callback.on_eval_begin", - "Callback.on_eval_end", - "Callback.on_test_begin", - "Callback.on_test_end", - "Callback.on_epoch_begin", - "Callback.on_epoch_end", - "Callback.on_train_batch_begin", - "Callback.on_train_batch_end", - "Callback.on_eval_batch_begin", - "Callback.on_eval_batch_end", - "Callback.on_test_batch_begin", - "Callback.on_test_batch_end", - "Model.prepare", - "SimpleRNNCell", - "SimpleRNNCell.forward", - "LSTMCell", - "LSTMCell.forward", - "GRUCell", - "GRUCell.forward", - "SimpleRNN", - "GRU", - "LSTM", - "RNN", - "BiRNN", - "RNNCellBase", - "RNNCellBase.get_initial_states", - "gelu", - "erf", - "DecodeHelper", - "DecodeHelper.initialize", - "DecodeHelper.sample", - "DecodeHelper.next_inputs", - "TrainingHelper.initialize", - "TrainingHelper.sample", - "TrainingHelper.next_inputs", - "GreedyEmbeddingHelper.initialize", - "GreedyEmbeddingHelper.sample", - "GreedyEmbeddingHelper.next_inputs", - "LayerList.append", - "HDFSClient", - "InitState", - "TracedLayer", - "SampleEmbeddingHelper.sample", - "BasicDecoder.initialize", - "BasicDecoder.step", - "ParameterList.append", - "GreedyEmbeddingHelper", - "SampleEmbeddingHelper", - "BasicDecoder", - "lstm", - "partial_sum", - "StateCell", - "StateCell.compute_state", - "TrainingDecoder", - "TrainingDecoder.step_input", - "TrainingDecoder.static_input", - "TrainingDecoder.output", - "BeamSearchDecoder", - "GradClipByValue", - "GradClipByNorm", - "Variable.detach", - "Variable.numpy", - "Variable.set_value", - "Variable.gradient", - "BeamSearchDecoder.decode", - "BeamSearchDecoder.read_array", - "CompiledProgram", - "CompiledProgram.with_data_parallel", - "append_backward", - "guard", - "to_variable", - "op_freq_statistic", - "save_dygraph", - "load_dygraph", - "ParallelExecutor", - "ParallelExecutor.run", - "ParallelExecutor.drop_local_exe_scopes", - "GradClipByGlobalNorm", - "extend_with_decoupled_weight_decay", - "switch", - "Normal", - "memory_usage", - "decorate", - "PiecewiseDecay", - "InverseTimeDecay", - "PolynomialDecay", - "NoamDecay", - "start_profiler", - "profiler", - "tree_conv", - "multiclass_nms2", - "DataFeedDesc", - "Conv2D", - "Conv3D", - "Conv3DTranspose", - "Embedding", - "NCE", - "PRelu", - "BilinearTensorProduct", - "GroupNorm", - "SpectralNorm", - "TreeConv", - "prroi_pool", - "ChunkEvaluator", - "EditDistance", - "ErrorClipByValue", - "Program.clone", - "cuda_pinned_places", - "DataFeeder", - "elementwise_floordiv", - "Layer", - 
"Layer.create_parameter", - "Layer.create_variable", - "Layer.sublayers", - "Layer.add_parameter", - "Layer.add_sublayer", - "Layer.parameters", - "Tracer", - "Layer.full_name", - "InMemoryDataset", - "layer_norm", - "bipartite_match", - "double_buffer", - "cumsum", - "thresholded_relu", - "group_norm", - "random_crop", - "row_conv", - "hard_shrink", - "ssd_loss", - "retinanet_target_assign", - "InMemoryDataset.global_shuffle", - "InMemoryDataset.get_memory_data_size", - "DetectionMAP", - "hash", - "InMemoryDataset.set_queue_num", - "LayerNorm", - "Preprocessor", - "chunk_eval", - "GRUUnit", - "ExponentialMovingAverage", - "QueueDataset.global_shuffle", - "NumpyArrayInitializer", - "create_py_reader_by_data", - "InMemoryDataset.local_shuffle", - "InMemoryDataset.get_shuffle_data_size", - "size", - "edit_distance", - "nce", - "BilinearInitializer", - "NaturalExpDecay", - "noam_decay", - "retinanet_detection_output", - "Pool2D", - "PipelineOptimizer", - "generate_mask_labels", - "isfinite", - "InMemoryDataset.set_fleet_send_batch_size", - "cuda_profiler", - "unfold", - "Executor", - "InMemoryDataset.load_into_memory", - "ExponentialDecay", - "BatchNorm", - "deformable_conv", - "InMemoryDataset.preload_into_memory", - "py_reader", - "linear_lr_warmup", - "InMemoryDataset.wait_preload_done", - "CosineDecay", - "roi_perspective_transform", - "unique", - "ones_like", - "LambOptimizer", - "InMemoryDataset.release_memory", - "Conv2DTranspose", - "QueueDataset.local_shuffle", - "save_persistables@dygraph/checkpoint.py", - "load_persistables@dygraph/checkpoint.py", - "elementwise_pow", - "WeightedAverage.reset", - "ChunkEvaluator.eval", - "NCE.forward", - "elementwise_div", - "BilinearTensorProduct.forward", - "NoamDecay.step", - "elementwise_min", - "PiecewiseDecay.step", - "Conv3DTranspose.forward", - "elementwise_add", - "IfElse.output", - "IfElse.true_block", - "InverseTimeDecay.step", - "PolynomialDecay.step", - "Precision.eval", - "enabled", - "elementwise_max", - "stop_gperf_profiler", - "IfElse.false_block", - "WeightedAverage.add", - "Auc.trapezoid_area", - "elementwise_mul", - "GroupNorm.forward", - "SpectralNorm.forward", - "elementwise_sub", - "Switch.case", - "IfElse.input", - "prepare_context", - "PRelu.forward", - "Recall.update", - "start_gperf_profiler", - "TreeConv.forward", - "Conv2D.forward", - "Switch.default", - "elementwise_mod", - "Precision.update", - "WeightedAverage.eval", - "Conv3D.forward", - "Embedding.forward", - "Recall.eval", - "FC.forward", - "While.block", - "DGCMomentumOptimizer", - "ParallelEnv", - "spawn", - "init_parallel_env", - "DataParallel", - "DataParallel.scale_loss", - "DataParallel.apply_collective_grads", - "BasicLSTMCell.forward", - "BasicGRUCell.forward", - "RNN.forward", - "StackedRNNCell.forward", - "StackedLSTMCell.forward", - "LSTM.forward", - "BidirectionalRNN.forward", - "BidirectionalLSTM.forward", - "StackedGRUCell.forward", - "GRU.forward", - "BidirectionalGRU.forward", - "DynamicDecode.forward", - "Conv1dPoolLayer.forward", - "CNNEncoder.forward", - "TransformerCell.forward", - "TransformerBeamSearchDecoder.step", - "MultiHeadAttention.forward", - "MultiHeadAttention.cal_kv", - "FFN.forward", - "TransformerEncoderLayer.forward", - "TransformerEncoder.forward", - "TransformerDecoderLayer.forward", - "TransformerDecoder.forward", - "TransformerDecoder.prepare_static_cache", - "TransformerDecoder.prepare_incremental_cache", - "LinearChainCRF.forward", - "CRFDecoding.forward", - "SequenceTagging.forward", - "XPUPlace", - "is_compiled_with_xpu", 
- "xpu_places" - ], - "gpu_not_white":[ - "deformable_conv", - "cuda_places", - "CUDAPinnedPlace", - "CUDAPlace", - "cuda_profiler", - "DGCMomentumOptimizer" - ] -}
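Note: the new `# required: ...` docstring directive exercised by the tests above works roughly as follows. The sketch below only illustrates the capacity/requirement matching rules implied by Test_get_test_capacity and Test_is_required_match; it is not the code in tools/sampcd_processor.py, and the environment-variable name used here is an assumption.

# Illustrative sketch only -- not the actual tools/sampcd_processor.py implementation.
# Behaviour mirrored from the tests: the capacity set always contains 'cpu', extra
# capacities come from an environment variable (name assumed) or RUN_ON_DEVICE='gpu',
# and a sample code runs only when every token of its 'required' line is available;
# 'skiptest' means the sample is never executed.
import os

ENV_KEY_TEST_CAPACITY = 'SAMPLE_CODE_TEST_CAPACITY'  # assumed variable name


def get_test_capacity(run_on_device='cpu'):
    """Return the set of capacities that sample codes may require here."""
    capacity = {'cpu'}  # a CPU is always assumed to be present
    env = os.environ.get(ENV_KEY_TEST_CAPACITY, '')
    capacity.update(tok.strip() for tok in env.split(',') if tok.strip())
    if run_on_device == 'gpu':
        capacity.add('gpu')
    return capacity


def is_required_match(required, capacity):
    """True/False if the requirement is (not) satisfied, None if it must be skipped."""
    if required is None or required.strip() == '':
        return True
    tokens = {tok.strip() for tok in required.split(',') if tok.strip()}
    if 'skiptest' in tokens or 'skip' in tokens:
        return None
    return tokens.issubset(capacity)


if __name__ == '__main__':
    cap = get_test_capacity()                   # {'cpu'} when the env var is unset
    print(is_required_match('gpu', cap))        # False on a CPU-only machine
    print(is_required_match('cpu', cap))        # True
    print(is_required_match('skiptest', cap))   # None -> never executed

Under these rules a block tagged "# required: gpu" is written out for execution only on a GPU-capable CI machine, while a "# required: skiptest" block is never executed and is instead counted in SUMMARY_INFO['skiptest'], as test_2_samplecodes_has_skipped above checks.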