Merge https://github.com/PaddlePaddle/Paddle into develop

836f341d · jingqinghe · 9cf7ddc8 · 9373cf5a · 836f341d · 836f341d
236 changed file
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -272,7 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib

 cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
-cc_library(generator SRCS generator.cc)
+cc_library(generator SRCS generator.cc DEPS enforce place)

 # Get the current working branch
 execute_process(

--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -116,6 +116,8 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
      return platform::to_void_cast(tensor.data<unsigned char>());
    case mkldnn::memory::data_type::s32:
      return platform::to_void_cast(tensor.data<int32_t>());
+    case mkldnn::memory::data_type::bf16:
+      return platform::to_void_cast(tensor.data<paddle::platform::bfloat16>());
    default:
      PADDLE_THROW(
          platform::errors::InvalidArgument("Wrong mkldnn type provided."));

--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -61,7 +61,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) {
      {DataTypeTrait<float>::DataType(), MKLDNNDataType::f32},
      {DataTypeTrait<int8_t>::DataType(), MKLDNNDataType::s8},
      {DataTypeTrait<uint8_t>::DataType(), MKLDNNDataType::u8},
-      {DataTypeTrait<int32_t>::DataType(), MKLDNNDataType::s32}};
+      {DataTypeTrait<int32_t>::DataType(), MKLDNNDataType::s32},
+      {DataTypeTrait<platform::bfloat16>::DataType(), MKLDNNDataType::bf16}};
  auto iter = dict.find(static_cast<int>(type));
  if (iter != dict.end()) return iter->second;
  return MKLDNNDataType::undef;
@@ -74,6 +75,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                               const OpKernelType& expected_kernel_type,
                               const Tensor& in, Tensor* out);
+
+void* GetDataFromTensor(const Tensor& tensor, MKLDNNDataType type);
+
 #endif

 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);

--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -43,3 +43,17 @@ TEST(DataTransform, DataLayoutFunction) {
  EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
  EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
 }
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(DataTransform, GetDataFromTensorDNNL) {
+  auto place = paddle::platform::CPUPlace();
+  paddle::framework::Tensor in = paddle::framework::Tensor();
+  in.mutable_data<paddle::platform::bfloat16>(
+      paddle::framework::make_ddim({2, 3, 1, 2}), place);
+
+  void* in_data =
+      paddle::framework::GetDataFromTensor(in, dnnl::memory::data_type::bf16);
+  EXPECT_EQ(in_data, paddle::platform::to_void_cast(
+                         in.data<paddle::platform::bfloat16>()));
+}
+#endif
--- a/paddle/fluid/framework/data_set.cc
+++ b/paddle/fluid/framework/data_set.cc
@@ -95,9 +95,10 @@ void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name,
                                   const std::string& fs_ugi) {
  fs_name_ = fs_name;
  fs_ugi_ = fs_ugi;
-  std::string cmd = std::string("hadoop fs");
+  std::string cmd = std::string("$HADOOP_HOME/bin/hadoop fs");
  cmd += " -D fs.default.name=" + fs_name;
  cmd += " -D hadoop.job.ugi=" + fs_ugi;
+  cmd += " -Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=500000";
  paddle::framework::hdfs_set_command(cmd);
 }


--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -18,6 +18,7 @@
 #include <unordered_map>

 using float16 = paddle::platform::float16;
+using bfloat16 = paddle::platform::bfloat16;

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -17,6 +17,8 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"

 namespace paddle {
@@ -36,15 +38,16 @@ struct DataTypeTrait<void> {
 #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \
  callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);

-#define _ForEachDataType_(callback)                                     \
-  _ForEachDataTypeHelper_(callback, float, FP32);                       \
-  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \
-  _ForEachDataTypeHelper_(callback, double, FP64);                      \
-  _ForEachDataTypeHelper_(callback, int, INT32);                        \
-  _ForEachDataTypeHelper_(callback, int64_t, INT64);                    \
-  _ForEachDataTypeHelper_(callback, bool, BOOL);                        \
-  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                    \
-  _ForEachDataTypeHelper_(callback, int16_t, INT16);                    \
+#define _ForEachDataType_(callback)                                      \
+  _ForEachDataTypeHelper_(callback, float, FP32);                        \
+  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16);  \
+  _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \
+  _ForEachDataTypeHelper_(callback, double, FP64);                       \
+  _ForEachDataTypeHelper_(callback, int, INT32);                         \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);                     \
+  _ForEachDataTypeHelper_(callback, bool, BOOL);                         \
+  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                     \
+  _ForEachDataTypeHelper_(callback, int16_t, INT16);                     \
  _ForEachDataTypeHelper_(callback, int8_t, INT8)

 #define _ForEachDataTypeSmall_(callback)           \

--- a/paddle/fluid/framework/data_type_test.cc
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -38,3 +38,25 @@ TEST(DataType, float16) {
  std::string type = "::paddle::platform::float16";
  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
 }
+
+TEST(DataType, bfloat16) {
+  using paddle::framework::Tensor;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::bfloat16;
+  namespace f = paddle::framework;
+  f::proto::VarType::Type dtype = f::proto::VarType::BF16;
+
+  Tensor tensor;
+  CPUPlace cpu;
+  tensor.mutable_data(cpu, dtype);
+
+  // test bf16 tensor
+  EXPECT_EQ(tensor.type(), f::ToDataType(typeid(bfloat16)));
+
+  // test bf16 size
+  EXPECT_EQ(f::SizeOfType(dtype), 2u);
+
+  // test debug info
+  std::string type = "::paddle::platform::bfloat16";
+  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
+}
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -77,6 +77,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
      framework::VisitDataType(dst_type,
                               CastDataType<platform::float16>(in, out, ctx));
      break;
+    case proto::VarType::BF16:
+      framework::VisitDataType(dst_type,
+                               CastDataType<platform::bfloat16>(in, out, ctx));
+      break;
    case proto::VarType::FP32:
      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
      break;

--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -24,6 +24,11 @@ TEST(DataTypeTransform, CPUTransform) {
      paddle::framework::DataLayout::kAnyLayout,
      paddle::framework::LibraryType::kPlain);

+  auto kernel_bf16 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BF16, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
  auto kernel_fp32 = paddle::framework::OpKernelType(
      paddle::framework::proto::VarType::FP32, place,
      paddle::framework::DataLayout::kAnyLayout,
@@ -189,4 +194,120 @@ TEST(DataTypeTransform, CPUTransform) {
                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
    }
  }
+
+  // data type transform from/to bfloat16
+  {
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor out;
+
+    paddle::platform::bfloat16* ptr =
+        in.mutable_data<paddle::platform::bfloat16>(
+            paddle::framework::make_ddim({2, 3}), place);
+    int data_number = 2 * 3;
+
+    for (int i = 0; i < data_number; ++i) {
+      ptr[i] = i;
+    }
+
+    // transform from bfloat16 to other data types
+    paddle::framework::TransDataType(kernel_bf16, kernel_fp32, in, &out);
+    float* out_data_float = out.data<float>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_fp64, in, &out);
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_int32, in, &out);
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_int64, in, &out);
+    int64_t* out_data_int64 = out.data<int64_t>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_bool, in, &out);
+    bool* out_data_bool = out.data<bool>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+    }
+
+    // transform float to bfloat16
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_float[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_fp32, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_float[i]).x);
+    }
+
+    // transform double to bfloat16
+    double* in_data_double =
+        in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_double[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_fp64, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_double[i]).x);
+    }
+
+    // transform int to bfloat16
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_int32, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_int[i]).x);
+    }
+
+    // transform int64 to bfloat16
+    int64_t* in_data_int64 =
+        in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int64[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_int64, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_int64[i]).x);
+    }
+
+    // transform bool to bfloat16
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_bool[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_bool, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_bool[i]).x);
+    }
+  }
 }
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -3,6 +3,7 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context

 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(fetch_async_op_handle SRCS fetch_async_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)

 cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry) 
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -98,7 +99,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
-        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+        DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool device_context)
 cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)

 cc_test(exception_holder_test SRCS exception_holder_test.cc )

--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -18,7 +18,8 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -120,6 +121,11 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
  }
  // Wait FetchOps.
  ClearFetchOp(graph_, &fetch_ops);
+
+  for (auto &place : places_) {
+    fetch_ctxs_.Get(place)->Wait();
+  }
+
  return fetches;
 }

@@ -162,8 +168,8 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(

    ir::Node *fetch_node =
        graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation);
-    auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_,
-                                 &local_exec_scopes_, return_merged);
+    auto *op = new FetchAsyncOpHandle(fetch_node, fetches, i, &local_scopes_,
+                                      &local_exec_scopes_, return_merged);
    fetch_ops->emplace_back(op);

    for (auto &p : places_) {
@@ -174,6 +180,14 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
      op->AddInput(var);
    }

+    for (auto *var : vars) {
+      auto *op = var->GeneratedOp();
+      auto *compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
+      if (compute_op) {
+        compute_op->SetLockAndRecordEventFree(false);
+      }
+    }
+
    int dep = static_cast<int>(op->NotReadyInputSize());
    (*op_deps)[op] = dep;
    if (dep == 0) {
@@ -261,7 +275,7 @@ void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
 const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }

 void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) {
-  if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchOpHandle *>(op)) {
+  if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchAsyncOpHandle *>(op)) {
    traced_ops_.emplace_back(op);
  }
 }

--- a/paddle/fluid/framework/details/fetch_async_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+FetchAsyncOpHandle::FetchAsyncOpHandle(ir::Node *node, FetchResultType *data,
+                                       size_t offset,
+                                       std::vector<Scope *> *local_scopes,
+                                       std::vector<Scope *> *local_exec_scopes,
+                                       bool return_merged)
+    : OpHandleBase(node),
+      data_(data),
+      offset_(offset),
+      local_scopes_(local_scopes),
+      local_exec_scopes_(local_exec_scopes),
+      return_merged_(return_merged) {}
+
+FetchAsyncOpHandle::~FetchAsyncOpHandle() {}
+
+void FetchAsyncOpHandle::RecordWaitEventOnCtx(
+    platform::DeviceContext *waited_ctx) {
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "No nodes need to wait FetchAsyncOp. Unexpceted Error."));
+}
+
+static void CheckTensorAttrs(const LoDTensor *tensor,
+                             const proto::VarType::Type &type,
+                             const DataLayout &layout, const DDim &dims,
+                             const LoD &lod, const size_t offset) {
+  if (tensor->numel() && tensor->IsInitialized()) {
+    // step1: check type
+    PADDLE_ENFORCE_EQ(
+        type, tensor->type(),
+        platform::errors::InvalidArgument(
+            "The data type of fetched Tensors or the items of fetched "
+            "LoDTensorArray are different from each other on different "
+            "devices(%s vs %s). And the error is caused by the %zu "
+            "(th) fetched variable. Please set the "
+            "parameter `return_merged = False` when you "
+            "call the `Executor.run()` method.",
+            DataTypeToString(type), DataTypeToString(tensor->type()), offset));
+
+    // step2: check layout
+    PADDLE_ENFORCE_EQ(
+        layout, tensor->layout(),
+        platform::errors::InvalidArgument(
+            "The layout of fetched Tensors or the items of fetched "
+            "LoDTensorArray are different from each other on different "
+            "devices(%s vs %s). And the error is caused by the %zu "
+            "(th) fetched variable. Please set the "
+            "parameter `return_merged = False` when you "
+            "call the `Executor.run()` method.",
+            DataLayoutToString(layout), DataLayoutToString(tensor->layout()),
+            offset));
+  }
+
+  // step3: check dims
+  auto tensor_dims = tensor->dims();
+  PADDLE_ENFORCE_EQ(dims.size(), tensor_dims.size(),
+                    platform::errors::InvalidArgument(
+                        "The dimension sizes of fetched Tensors or "
+                        "the items of fetched LoDTensorArray are "
+                        "different from each other on different "
+                        "devices(%s vs %s). And the error is caused by the %zu "
+                        "(th) fetched variable. Please set the "
+                        "parameter `return_merged = False` when you "
+                        "call the `Executor.run()` method.",
+                        dims, tensor_dims, offset));
+  for (int j = 1; j < dims.size(); j++) {
+    PADDLE_ENFORCE_EQ(dims[j], tensor_dims[j],
+                      platform::errors::InvalidArgument(
+                          "The dimensions of fetched Tensors or "
+                          "the items of fetched LoDTensorArray are "
+                          "different from each other on different "
+                          "devices(%s vs %s). And the error is caused by the "
+                          "%zu (th) fetched variable. Please set the "
+                          "parameter `return_merged = False` when "
+                          "you call the `Executor.run()` method.",
+                          dims, tensor_dims, offset));
+  }
+
+  // step4: check lod
+  PADDLE_ENFORCE_EQ(
+      lod.size(), tensor->lod().size(),
+      platform::errors::InvalidArgument(
+          "The LoD information of fetched Tensors or the items of fetched "
+          "LoDTensorArray are different from each other on different "
+          "devices(%s vs %s). And the error is caused by the %zu "
+          "(th) fetched variable. Please set the "
+          "parameter `return_merged = False` when you "
+          "call the `Executor.run()` method.",
+          lod, tensor->lod(), offset));
+}
+
+static void TransData(const framework::Tensor *src_item,
+                      framework::Tensor *dst_item,
+                      const platform::DeviceContext &ctx) {
+  if (src_item->IsInitialized() && src_item->numel() > 0) {
+    if (platform::is_gpu_place(src_item->place())) {
+#ifdef PADDLE_WITH_CUDA
+      TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
+#endif
+    } else {
+      TensorCopy(*src_item, platform::CPUPlace(), dst_item);
+    }
+  }
+}
+
+void FetchAsyncOpHandle::FetchMergedLodTensor(
+    const std::vector<const LoDTensor *> &src_lodtensors,
+    LoDTensor *dst_lodtensor) {
+  // calc dst type,layout,dim,lod and calc check dim
+  proto::VarType::Type new_type = proto::VarType::FP32;
+  framework::DataLayout new_layout;
+  framework::DDim new_dim;
+  LoD new_lod = src_lodtensors[0]->lod();
+
+  framework::DDim check_dim;
+
+  for (auto *t : src_lodtensors) {
+    if (t->numel() && t->IsInitialized()) {
+      check_dim = t->dims();
+      new_type = t->type();
+      new_layout = t->layout();
+      break;
+    }
+  }
+
+  bool find_first_dims = false;
+  for (auto *t : src_lodtensors) {
+    if (t->numel() && t->IsInitialized()) {
+      if (!find_first_dims) {
+        new_dim = t->dims();
+        find_first_dims = true;
+      } else {
+        new_dim[0] += t->dims()[0];
+      }
+    }
+  }
+
+  // check src type,layout,dim,lod consistence
+  for (size_t i = 1; i < src_lodtensors.size(); ++i) {
+    CheckTensorAttrs(src_lodtensors[i], new_type, new_layout, check_dim,
+                     new_lod, offset_);
+  }
+
+  // set dst tensor
+  dst_lodtensor->Resize(new_dim);
+  dst_lodtensor->set_layout(src_lodtensors[0]->layout());
+  dst_lodtensor->set_lod(src_lodtensors[0]->lod());
+  if (platform::is_gpu_place(src_lodtensors[0]->place())) {
+    dst_lodtensor->mutable_data(platform::CUDAPinnedPlace(),
+                                src_lodtensors[0]->type());
+  } else {
+    dst_lodtensor->mutable_data(platform::CPUPlace(),
+                                src_lodtensors[0]->type());
+  }
+
+  // slice and memcpy
+  int begin = 0;
+  for (auto *src : src_lodtensors) {
+    int end = begin + src->dims()[0];
+    if (end == begin) {
+      continue;
+    }
+    auto dst = dst_lodtensor->Slice(begin, end);
+    TransData(src, &dst, *dev_ctxes_[src->place()]);
+    begin = end;
+  }
+}
+
+void FetchAsyncOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
+  WaitInputVarGenerated();
+
+  // get src vars
+  auto &scopes = *local_exec_scopes_;
+  std::vector<Variable *> src_vars;
+  src_vars.reserve(inputs_.size());
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
+    auto &scope = scopes.at(var_handle->scope_idx());
+    auto *var = scope->FindVar(var_handle->name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::NotFound(
+            "Cannot find variable %s in execution scope.", var_handle->name()));
+    src_vars.emplace_back(var);
+  }
+
+  if (return_merged_) {
+    auto &val = BOOST_GET(FetchList, *data_);
+    if (src_vars[0]->IsType<LoDTensor>()) {
+      // to lodtensor type
+      std::vector<const LoDTensor *> src_lodtensors;
+      src_lodtensors.reserve(src_vars.size());
+      for (size_t i = 0; i < src_vars.size(); ++i) {
+        src_lodtensors.emplace_back(&src_vars[i]->Get<framework::LoDTensor>());
+      }
+
+      LoDTensor dst_lodtensor;
+      FetchMergedLodTensor(src_lodtensors, &dst_lodtensor);
+      val.at(offset_) = std::move(dst_lodtensor);
+    } else {
+      // to lodtensorarray type
+      std::vector<const LoDTensorArray *> src_lodtensor_arrays;
+      src_lodtensor_arrays.reserve(src_vars.size());
+      for (size_t i = 0; i < src_vars.size(); ++i) {
+        src_lodtensor_arrays.emplace_back(
+            &src_vars[i]->Get<framework::LoDTensorArray>());
+      }
+
+      LoDTensorArray dst_lodtensor_array;
+      dst_lodtensor_array.resize(src_lodtensor_arrays[0]->size());
+
+      for (size_t i = 0; i < dst_lodtensor_array.size(); ++i) {
+        std::vector<const LoDTensor *> src_lodtensors;
+        src_lodtensors.reserve(src_lodtensor_arrays.size());
+        for (size_t j = 0; j < src_lodtensor_arrays.size(); ++j) {
+          src_lodtensors.emplace_back(&(*src_lodtensor_arrays[j])[i]);
+        }
+        FetchMergedLodTensor(src_lodtensors, &dst_lodtensor_array[i]);
+      }
+      val.at(offset_) = std::move(dst_lodtensor_array);
+    }
+  } else {
+    auto &val = BOOST_GET(FetchUnmergedList, *data_);
+    auto &dst_tensors = val.at(offset_);
+    dst_tensors.reserve(src_vars.size());
+
+    for (size_t i = 0; i < src_vars.size(); ++i) {
+      if (src_vars[i]->IsType<LoDTensor>()) {
+        auto &t = src_vars[i]->Get<framework::LoDTensor>();
+        LoDTensor item;
+        TransData(&t, &item, *dev_ctxes_[t.place()]);
+        dst_tensors.emplace_back(std::move(item));
+      } else {
+        auto &t = src_vars[i]->Get<framework::LoDTensorArray>();
+        LoDTensorArray item;
+        item.resize(t.size());
+        for (size_t j = 0; j < t.size(); ++j) {
+          TransData(&t[j], &item[j], *dev_ctxes_[t[j].place()]);
+        }
+        dst_tensors.emplace_back(std::move(item));
+      }
+    }
+  }
+}
+
+bool FetchAsyncOpHandle::IsMultiDeviceTransfer() { return true; }
+
+std::string FetchAsyncOpHandle::Name() const { return "FetchAsync"; }
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_async_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.h
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FetchAsyncOpHandle : public OpHandleBase {
+ public:
+  FetchAsyncOpHandle(ir::Node *node, FetchResultType *data, size_t offset,
+                     std::vector<Scope *> *local_scopes,
+                     std::vector<Scope *> *local_exec_scopes,
+                     bool return_merged);
+
+  ~FetchAsyncOpHandle();
+
+  void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override;
+
+ protected:
+  void RunImpl() override;
+
+  std::vector<Scope *> GetLocalScopes() override { return *local_scopes_; }
+
+  void FetchMergedLodTensor(
+      const std::vector<const LoDTensor *> &src_lodtensors,
+      LoDTensor *dst_lodtensor);
+
+ private:
+  FetchResultType *data_;
+  size_t offset_;
+  std::vector<Scope *> *local_scopes_;
+  std::vector<Scope *> *local_exec_scopes_;
+  bool return_merged_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -36,7 +36,8 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FetchResultType *data,
 FetchOpHandle::~FetchOpHandle() {}

 void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
-  PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "No nodes need to wait FetchOp. Unexpceted Error."));
 }

 static void CheckDims(const framework::DDim &tensor_dims,

--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -167,6 +167,8 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num,
 // more detail see: 180 page of
 // https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf
 #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in)
+#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \
+                              omp_in)
 #endif

 template <typename T>
@@ -205,6 +207,21 @@ void CheckNanInf<paddle::platform::float16>(
    PrintNanInf(value, numel, print_num, op_type, var_name);
  }
 }
+
+template <>
+void CheckNanInf<paddle::platform::bfloat16>(
+    const paddle::platform::bfloat16* value, const size_t numel, int print_num,
+    const std::string& op_type, const std::string& var_name) {
+  float sum = 0.0f;
+#pragma omp parallel for reduction(+ : sum)
+  for (size_t i = 0; i < numel; ++i) {
+    sum += static_cast<float>(value[i] - value[i]);
+  }
+
+  if (std::isnan(sum) || std::isinf(sum)) {
+    PrintNanInf(value, numel, print_num, op_type, var_name);
+  }
+}
 #endif

 template <>

--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/details/fetch_async_op_handle.h"

 namespace paddle {
 namespace framework {
@@ -23,9 +24,11 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
  if (fetch_ops->empty()) return;

  for (auto& op : *fetch_ops) {
-    PADDLE_ENFORCE_NOT_NULL(
-        dynamic_cast<FetchOpHandle*>(op),
-        "The input ops of ClearFetchOp function should be FetchOpHandle.");
+    PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
+                          dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
+                      true,
+                      "The input ops of ClearFetchOp function should be "
+                      "FetchOpHandle or FetchAsyncOpHandle.");
    for (auto& out_var : op->Node()->outputs) {
      graph->RemoveNode(out_var);
    }

--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -23,6 +23,7 @@ template <typename T>
 static ::DLDataType GetDLDataTypeCode() {
  ::DLDataType dtype;
  if (std::is_same<T, platform::float16>::value ||
+      std::is_same<T, platform::bfloat16>::value ||
      std::is_floating_point<T>::value) {
    dtype.code = kDLFloat;
  } else if (std::is_unsigned<T>::value) {

--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -857,7 +857,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
    float* g = g_tensor->data<float>();

    if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
-      int dim = emb_dim + offset;
+      int dim = emb_dim;
      Eigen::Map<
          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
          g_mat(g, g_tensor->numel() / dim, dim);

--- a/paddle/fluid/framework/generator.cc
+++ b/paddle/fluid/framework/generator.cc
@@ -21,10 +21,46 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
+#include <vector>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace framework {

+const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(int64_t device_id) {
+#ifdef PADDLE_WITH_CUDA
+
+  static int64_t num_cuda_devices = -1;
+  static std::once_flag num_devices_init_flag;
+  static std::deque<std::once_flag> cuda_device_flags;
+  static std::vector<std::shared_ptr<Generator>> default_cuda_generators;
+
+  std::call_once(num_devices_init_flag, []() {
+    num_cuda_devices = paddle::platform::GetCUDADeviceCount();
+    cuda_device_flags.resize(num_cuda_devices);
+    default_cuda_generators.resize(num_cuda_devices);
+  });
+  if (device_id < 0) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "cuda device id shoule be greater than 0"));
+  }
+
+  std::call_once(cuda_device_flags[device_id], [device_id]() {
+    default_cuda_generators[device_id] =
+        std::make_shared<Generator>(GetRandomSeed(), device_id);
+    VLOG(4) << "initial seed: "
+            << default_cuda_generators[device_id]->GetCurrentSeed();
+  });
+  return default_cuda_generators[device_id];
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "getDefaultCUDAGenerator only support in CUDA place"));
+#endif
+}
+
 const std::shared_ptr<Generator>& DefaultCPUGenerator() {
  static auto default_cpu_generator =
      std::make_shared<Generator>(GetRandomSeed());
@@ -103,6 +139,7 @@ uint64_t Generator::Seed() {
 void Generator::SetCurrentSeed(uint64_t seed) {
  std::lock_guard<std::mutex> lock(this->mu_);
  this->state_.current_seed = seed;
+  this->state_.thread_offset = 0;
  std::seed_seq seq({seed});
  this->engine_->seed(seq);
 }
@@ -123,6 +160,22 @@ uint64_t Generator::Random64() {
  return (*engine)();
 }

+std::pair<uint64_t, uint64_t> Generator::IncrementOffset(
+    uint64_t increament_offset) {
+  uint64_t cur_offset = this->state_.thread_offset;
+#ifdef PADDLE_WITH_CUDA
+  std::lock_guard<std::mutex> lock(this->mu_);
+
+  this->state_.thread_offset += increament_offset;
+
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "Increment Offset only support in CUDA place"));
+#endif
+  return std::make_pair(static_cast<int>(this->state_.current_seed),
+                        cur_offset);
+}
+
 void Generator::SetIsInitPy(bool is_init_py) {
  this->is_init_py_ = is_init_py;
  VLOG(4) << "SetIsInitPy:" << this->is_init_py_;

--- a/paddle/fluid/framework/generator.h
+++ b/paddle/fluid/framework/generator.h
@@ -38,6 +38,7 @@ static uint64_t GetRandomSeed() {
 struct GeneratorState {
  int64_t device = -1;
  uint64_t current_seed = 34342423252;
+  uint64_t thread_offset = 0;
  std::mt19937_64 cpu_engine;
 };

@@ -49,6 +50,7 @@ struct Generator {
    this->state_.cpu_engine = *engine;
    this->state_.device = -1;
    this->state_.current_seed = seed;
+    this->state_.thread_offset = 0;
    this->engine_ = engine;
    VLOG(4) << "initial seed: " << this->state_.current_seed
            << ", cpu engine: " << &this->state_.cpu_engine;
@@ -59,11 +61,25 @@ struct Generator {
    this->state_.cpu_engine = *engine;
    this->state_.device = -1;
    this->state_.current_seed = seed;
+    this->state_.thread_offset = 0;
    this->engine_ = engine;
    VLOG(4) << "initial seed: " << this->state_.current_seed
            << ", cpu engine: " << &this->state_.cpu_engine;
    this->is_init_py_ = true;  // TODO(zhiqiu): remove it in future
  }
+  Generator(uint64_t seed, uint64_t device_id) {
+    std::seed_seq seq({seed});
+    auto engine = std::make_shared<std::mt19937_64>(seq);
+    this->state_.cpu_engine = *engine;
+    this->state_.device = device_id;
+    this->state_.current_seed = seed;
+    this->state_.thread_offset = 0;
+    this->engine_ = engine;
+    VLOG(4) << "initial seed: " << this->state_.current_seed
+            << ", cpu engine: " << &this->state_.cpu_engine;
+    this->is_init_py_ = false;  // TODO(zhiqiu): remove it in future
+  }
+
  Generator(const Generator& other) = delete;

  // get random state
@@ -83,8 +99,11 @@ struct Generator {

  uint64_t Random64();

+  std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t increament_offset);
+
  void SetIsInitPy(bool);
  bool GetIsInitPy() const;
+  uint64_t get_device_id() { return this->state_.device; }

 private:
  GeneratorState state_;
@@ -105,5 +124,8 @@ std::shared_ptr<std::mt19937_64> OpDefaultCPUEngine();

 std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t);

+const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(
+    int64_t device_id = -1);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/op_version_registry.h
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -133,6 +133,9 @@ class OpVersion {
    checkpoints_.push_back(Checkpoint({note, op_version_desc}));
    return *this;
  }
+  uint32_t GetVersionID() const {
+    return static_cast<uint32_t>(checkpoints_.size());
+  }

 private:
  struct Checkpoint {
@@ -156,6 +159,14 @@ class OpVersionRegistrar {
    op_version_map_.insert({op_type, OpVersion()});
    return op_version_map_[op_type];
  }
+  uint32_t GetVersionID(const std::string& op_type) const {
+    auto it = op_version_map_.find(op_type);
+    if (it == op_version_map_.end()) {
+      return 0;
+    }
+
+    return it->second.GetVersionID();
+  }

 private:
  std::unordered_map<std::string, OpVersion> op_version_map_;
@@ -164,6 +175,125 @@ class OpVersionRegistrar {
  OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete;
 };

+class OpVersionComparator {
+ public:
+  virtual bool operator()() = 0;
+  virtual ~OpVersionComparator() = default;
+};
+
+#define ADD_OP_VERSION_COMPARATOR(cmp_name, cmp_math)                   \
+  class OpVersion##cmp_name##Comparator : public OpVersionComparator {  \
+   public:                                                              \
+    explicit OpVersion##cmp_name##Comparator(const std::string op_name, \
+                                             uint32_t target_version)   \
+        : op_name_(op_name), target_version_(target_version) {}         \
+    virtual bool operator()() {                                         \
+      return OpVersionRegistrar::GetInstance().GetVersionID(op_name_)   \
+          cmp_math target_version_;                                     \
+    }                                                                   \
+    virtual ~OpVersion##cmp_name##Comparator() {}                       \
+                                                                        \
+   private:                                                             \
+    std::string op_name_;                                               \
+    uint32_t target_version_;                                           \
+  };
+
+ADD_OP_VERSION_COMPARATOR(LE, <=);
+ADD_OP_VERSION_COMPARATOR(EQ, ==);
+ADD_OP_VERSION_COMPARATOR(GE, >=);
+ADD_OP_VERSION_COMPARATOR(NE, !=);
+
+class OpVersionComparatorCombination {
+ public:
+  OpVersionComparatorCombination() {}
+
+  OpVersionComparatorCombination& LE(const std::string& op_name,
+                                     int target_version) {
+    op_version_comparators_.push_back(std::shared_ptr<OpVersionComparator>(
+        new OpVersionLEComparator(op_name, target_version)));
+    return *this;
+  }
+  OpVersionComparatorCombination& EQ(const std::string& op_name,
+                                     int target_version) {
+    op_version_comparators_.push_back(std::shared_ptr<OpVersionComparator>(
+        new OpVersionEQComparator(op_name, target_version)));
+    return *this;
+  }
+  OpVersionComparatorCombination& GE(const std::string& op_name,
+                                     int target_version) {
+    op_version_comparators_.push_back(std::shared_ptr<OpVersionComparator>(
+        new OpVersionGEComparator(op_name, target_version)));
+    return *this;
+  }
+  OpVersionComparatorCombination& NE(const std::string& op_name,
+                                     int target_version) {
+    op_version_comparators_.push_back(std::shared_ptr<OpVersionComparator>(
+        new OpVersionNEComparator(op_name, target_version)));
+    return *this;
+  }
+
+  bool IsMatched() const {
+    for (const auto& cmp : op_version_comparators_) {
+      if (!(*cmp)()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  std::vector<std::shared_ptr<OpVersionComparator>> op_version_comparators_;
+};
+
+class PassVersionCheckers {
+ public:
+  PassVersionCheckers& AddCombination(
+      const OpVersionComparatorCombination& combinations) {
+    pass_version_checkers_.push_back(combinations);
+    return *this;
+  }
+  bool IsPassCompatible() const {
+    if (pass_version_checkers_.empty()) {
+      return true;
+    }
+    for (const auto& checker : pass_version_checkers_) {
+      if (checker.IsMatched()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  std::vector<OpVersionComparatorCombination> pass_version_checkers_;
+};
+
+class PassVersionCheckerRegistrar {
+ public:
+  static PassVersionCheckerRegistrar& GetInstance() {
+    static PassVersionCheckerRegistrar instance;
+    return instance;
+  }
+  PassVersionCheckers& Register(const std::string& pass_name) {
+    return pass_version_checkers_map_[pass_name];
+  }
+  bool IsPassCompatible(const std::string& fuse_pass_name) const {
+    auto iter = pass_version_checkers_map_.find(fuse_pass_name);
+    if (iter == pass_version_checkers_map_.end()) {
+      return true;
+    }
+    return iter->second.IsPassCompatible();
+  }
+
+ private:
+  std::unordered_map<std::string, PassVersionCheckers>
+      pass_version_checkers_map_;
+
+  PassVersionCheckerRegistrar() = default;
+  PassVersionCheckerRegistrar& operator=(const PassVersionCheckerRegistrar&) =
+      delete;
+};
+
 }  // namespace compatible
 }  // namespace framework
 }  // namespace paddle
@@ -173,3 +303,9 @@ class OpVersionRegistrar {
      RegisterOpVersion__##op_type =                                       \
          paddle::framework::compatible::OpVersionRegistrar::GetInstance() \
              .Register(#op_type)
+
+#define REGISTER_PASS_CAPABILITY(pass_name)                        \
+  static auto RegisterOpPassVersionChecker__##pass_name =          \
+      paddle::framework::compatible::PassVersionCheckerRegistrar:: \
+          GetInstance()                                            \
+              .Register(#pass_name)
--- a/paddle/fluid/framework/op_version_registry_test.cc
+++ b/paddle/fluid/framework/op_version_registry_test.cc
@@ -55,6 +55,72 @@ TEST(test_operator_version, test_operator_version) {
              .NewInput("X2", "The second input.")
              .NewOutput("Y2", "The second output."));
 }
+
+TEST(test_pass_op_version_checker, test_pass_op_version_checker) {
+  ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "no_bind_pass"));
+
+  REGISTER_PASS_CAPABILITY(test_pass1)
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .LE("mul", 1)
+              .EQ("fc", 0));
+  ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "test_pass1"));
+
+  REGISTER_PASS_CAPABILITY(test_pass2)
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .GE("mul", 0)
+              .NE("fc", 0));
+  ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "test_pass2"));
+
+  REGISTER_PASS_CAPABILITY(test_pass3)
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .GE("mul", 0)
+              .NE("fc", 0))
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .LE("mul", 1)
+              .EQ("fc", 0));
+  ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "test_pass3"));
+
+  REGISTER_PASS_CAPABILITY(test_pass4)
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .GE("test__", 5)
+              .EQ("fc", 0));
+  ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "test_pass4"));
+
+  REGISTER_PASS_CAPABILITY(test_pass5)
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .GE("test__", 4)
+              .EQ("fc", 0));
+  ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "test_pass5"));
+
+  REGISTER_PASS_CAPABILITY(test_pass6)
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .EQ("test__", 4)
+              .EQ("fc", 0));
+  ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "test_pass6"));
+
+  REGISTER_PASS_CAPABILITY(test_pass7)
+      .AddCombination(
+          paddle::framework::compatible::OpVersionComparatorCombination()
+              .NE("test__", 4)
+              .EQ("fc", 0));
+  ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "test_pass7"));
+}
+
 }  // namespace compatible
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -90,32 +90,6 @@ void MemoryOptimizePass::CollectLifeCycle(
  }
 }

-// TODO(Superjomn) Make this a general help method.
-int DataTypeToSpace(framework::proto::VarType_Type type) {
-  switch (type) {
-    case framework::proto::VarType_Type_BOOL:
-      return sizeof(bool);
-    case framework::proto::VarType_Type_FP32:
-      return sizeof(float);
-    case framework::proto::VarType_Type_INT32:
-      return sizeof(int32_t);
-    case framework::proto::VarType_Type_INT64:
-      return sizeof(int64_t);
-    case framework::proto::VarType_Type_INT16:
-      return sizeof(int16_t);
-    case framework::proto::VarType_Type_FP16:
-      return sizeof(int16_t);
-    case framework::proto::VarType_Type_FP64:
-      return sizeof(double);
-    case framework::proto::VarType_Type_UINT8:
-      return sizeof(unsigned char);
-    case framework::proto::VarType_Type_INT8:
-      return sizeof(int8_t);
-    default:
-      PADDLE_THROW("Unknown data type");
-  }
-}
-
 void MemoryOptimizePass::CollectVarMemorySize(
    space_table_t* space_table) const {
  const int fake_batch_size = 1;
@@ -163,7 +137,7 @@ void MemoryOptimizePass::CollectVarMemorySize(
      int size = std::accumulate(shape.begin(), shape.end(), 1,
                                 std::multiplies<int>());
      (*space_table)[node->Var()->Name()] =
-          size * DataTypeToSpace(node->Var()->GetDataType());
+          size * paddle::framework::SizeOfType(node->Var()->GetDataType());
    }
  }
 }

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1058,6 +1058,7 @@ USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
 USE_TRT_CONVERTER(skip_layernorm);
 USE_TRT_CONVERTER(slice);
 USE_TRT_CONVERTER(scale);
+USE_TRT_CONVERTER(stack);
 #endif

 namespace paddle_infer {

--- a/paddle/fluid/inference/lite/test_engine.cc
+++ b/paddle/fluid/inference/lite/test_engine.cc
@@ -14,15 +14,16 @@

 #include <gtest/gtest.h>

-#include "paddle/fluid/inference/lite/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/operators/lite/ut_helper.h"

 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"

+#include "paddle/fluid/inference/lite/engine.h"
+#include "paddle/fluid/operators/lite/ut_helper.h"
+
 namespace paddle {
 namespace inference {
 namespace lite {

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -3,8 +3,8 @@ nv_library(tensorrt_converter
           SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
                batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
                pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc
-                shuffle_channel_op.cc swish_op.cc instance_norm_op.cc
-emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc
+                shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc
+                emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc
           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS

--- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc
@@ -58,6 +58,24 @@ class ScaleOpConverter : public OpConverter {
    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};
    nvinfer1::ILayer* layer = nullptr;
+
+    auto input_dim = input->getDimensions();
+    PADDLE_ENFORCE_GE(input_dim.nbDims, 3,
+                      platform::errors::Fatal(
+                          "Paddle-TRT scale mode only support dimension >= 3"));
+
+    nvinfer1::IShuffleLayer* expand_layer = nullptr;
+    nvinfer1::IShuffleLayer* squeeze_layer = nullptr;
+
+    if (input_dim.nbDims == 3) {
+      // TensorRT scale layer is not supporting input dims < 4 when using
+      // explicit batch
+      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+      nvinfer1::Dims4 target_shape(0, 0, 0, 1);  // expand 1 dims
+      expand_layer->setReshapeDimensions(target_shape);
+      input = expand_layer->getOutput(0);
+    }
+
    if (bias_after_scale) {
      layer = TRT_ENGINE_ADD_LAYER(
          engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM,
@@ -73,6 +91,18 @@ class ScaleOpConverter : public OpConverter {
          power_weights.get(), scale_weights.get(), power_weights.get());
    }

+    PADDLE_ENFORCE_EQ(layer != nullptr, true,
+                      platform::errors::Fatal("Create scale layer failed."));
+
+    if (input_dim.nbDims == 3) {
+      // TensorRT scale layer is not supporting input dims < 4 when using
+      // explicit batch
+      squeeze_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
+      nvinfer1::Dims3 target_shape(0, 0, 0);  // expand 1 dims
+      squeeze_layer->setReshapeDimensions(target_shape);
+      layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
+    }
    RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode);
  }
 };

--- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Stack converter from fluid to tensorRT.
+ */
+class StackOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert fluid stack op to tensorrt stack layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    auto input = op_desc.Input("X");
+    int input_num = input.size();
+    nvinfer1::ITensor** inputs =
+        (nvinfer1::ITensor**)malloc(input_num * sizeof(nvinfer1::ITensor*));
+
+    for (int i = 0; i < input_num; ++i) {
+      inputs[i] = engine_->GetITensor(input[i]);
+    }
+
+    int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
+    if (axis < 0) {
+      axis = axis + inputs[0]->getDimensions().nbDims + 1;
+    }
+
+    nvinfer1::ILayer* layer = nullptr;
+    if (engine_->with_dynamic_shape()) {
+#if IS_TRT_VERSION_GE(6000)
+      plugin::StackPluginDynamic* plugin =
+          new plugin::StackPluginDynamic(axis, input_num);
+      layer = engine_->AddPluginV2(inputs, input_num, plugin);
+      assert(layer != nullptr);
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "You are running the TRT Dynamic Shape mode, need to confirm that "
+          "your TRT version is no less than 6.0"));
+#endif
+    } else {
+      PADDLE_THROW(platform::errors::Fatal(
+          "You are running the Ernie(Bert) model in static"
+          "shape mode, which is not supported for the time being.\n"
+          "You can use the config.SetTRTDynamicShapeInfo(...) interface"
+          " to set the shape information to run the dynamic shape mode."));
+    }
+    auto output_name = op_desc.Output("Y").front();
+    RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode);
+    free(inputs);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(stack, StackOpConverter);
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -88,6 +88,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "gelu",
      "layer_norm",
      "scale",
+      "stack",
  };
 };


--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
 nv_library(tensorrt_plugin
           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
-           prelu_op_plugin.cu  trt_plugin_factory.cc gelu_op_plugin.cu 
+           prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu
           pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu
-instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
-qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu
-           DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) 
+           instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
+           qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu
+           hard_swish_op_plugin.cu stack_op_plugin.cu
+           DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
--- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
@@ -104,32 +104,51 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions(

  auto stri_0 = expr_builder.constant(strides_[0]);
  auto stri_1 = expr_builder.constant(strides_[1]);
+  auto one_value = expr_builder.constant(1);

-  auto tmp1_0 =
-      expr_builder.constant((-ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1);
-  auto tmp1_1 =
-      expr_builder.constant((-ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1);
+  auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]);
+  auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]);

-  auto tmp2_0 = expr_builder.constant(
-      (-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / strides_[0] + 1);
-  auto tmp2_1 = expr_builder.constant(
-      (-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / strides_[1] + 1);
-
-  auto *a_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
-                                     *inputs[0].d[2], *stri_0);
-  auto *b_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
-                                     *inputs[0].d[3], *stri_1);
+  auto ceil_tmp =
+      expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1);
+  auto ceil1_tmp =
+      expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1);

  if (!ceil_mode_) {
-    output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
-                                         *a_d, *tmp1_0);
-    output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
-                                         *b_d, *tmp1_1);
+    output.d[2] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[2], *v0_tmp),
+            *stri_0),
+        *one_value);
+    output.d[3] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[3], *v1_tmp),
+            *stri_1),
+        *one_value);
+
  } else {
-    output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
-                                         *a_d, *tmp2_0);
-    output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
-                                         *b_d, *tmp2_1);
+    output.d[2] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[2], *ceil_tmp),
+            *stri_0),
+        *one_value);
+    output.d[3] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[3], *ceil1_tmp),
+            *stri_1),
+        *one_value);
  }

  return output;

--- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cassert>
+#include <cstring>
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+StackPluginDynamic::StackPluginDynamic(int axis, int num_stack)
+    : axis_(axis), num_stack_(num_stack) {}
+
+StackPluginDynamic::StackPluginDynamic(void const* serial_data,
+                                       size_t serial_length) {
+  DeserializeValue(&serial_data, &serial_length, &axis_);
+  DeserializeValue(&serial_data, &serial_length, &num_stack_);
+}
+
+StackPluginDynamic::~StackPluginDynamic() {}
+
+nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const {
+  return new StackPluginDynamic(axis_, num_stack_);
+}
+
+const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; }
+
+int StackPluginDynamic::getNbOutputs() const { return 1; }
+
+int StackPluginDynamic::initialize() { return 0; }
+
+size_t StackPluginDynamic::getSerializationSize() const {
+  size_t serialize_size = 0;
+  serialize_size += SerializedSize(axis_);
+  serialize_size += SerializedSize(num_stack_);
+  return serialize_size;
+}
+
+void StackPluginDynamic::serialize(void* buffer) const {
+  SerializeValue(&buffer, axis_);
+  SerializeValue(&buffer, num_stack_);
+}
+
+nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
+    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
+    nvinfer1::IExprBuilder& expr_builder) {
+  nvinfer1::DimsExprs output(inputs[0]);
+  output.nbDims = inputs[0].nbDims + 1;
+
+  for (int i = inputs[0].nbDims; i > axis_; --i) {
+    output.d[i] = inputs[0].d[i - 1];
+  }
+  output.d[axis_] = expr_builder.constant(nb_inputs);
+  return output;
+}
+
+void StackPluginDynamic::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
+
+size_t StackPluginDynamic::getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
+  return num_stack_ * sizeof(uintptr_t);
+}
+
+void StackPluginDynamic::destroy() { delete this; }
+
+void StackPluginDynamic::terminate() {}
+
+bool StackPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
+    int nb_outputs) {
+  PADDLE_ENFORCE_NOT_NULL(
+      in_out, platform::errors::InvalidArgument(
+                  "The input of stack plugin should not be nullptr."));
+
+  PADDLE_ENFORCE_LT(
+      pos, nb_inputs + nb_outputs,
+      platform::errors::InvalidArgument("The pos(%d) should be less than the "
+                                        "num(%d) of the input and the output.",
+                                        pos, nb_inputs + nb_outputs));
+
+  const nvinfer1::PluginTensorDesc& in = in_out[pos];
+  if (pos == 0) {
+#ifdef SUPPORTS_CUDA_FP16
+    return (in.type == nvinfer1::DataType::kFLOAT ||
+            in.type == nvinfer1::DataType::kHALF) &&
+           (in.format == nvinfer1::TensorFormat::kLINEAR);
+#else
+    return (in.type == nvinfer1::DataType::kFLOAT) &&
+           (in.format == nvinfer1::TensorFormat::kLINEAR);
+#endif
+  }
+  const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
+  // output
+  return in.type == prev.type && in.format == prev.format;
+}
+
+nvinfer1::DataType StackPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* input_types, int nb_inputs) const {
+  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
+                                  "The index should be equal to 0"));
+  return input_types[0];
+}
+
+template <typename T>
+__global__ void StackKernel(const T* const* input, T* output, int num_stack,
+                            int base_unit) {
+  int stack_id = blockIdx.x;
+  int lead_id = blockIdx.y;
+
+  for (int i = threadIdx.x; i < base_unit; i += blockDim.x) {
+    output[lead_id * num_stack * base_unit + stack_id * base_unit + i] =
+        input[stack_id][lead_id * base_unit + i];
+  }
+}
+
+int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
+                                const nvinfer1::PluginTensorDesc* output_desc,
+                                const void* const* inputs, void* const* outputs,
+                                void* workspace, cudaStream_t stream) {
+  auto input_dims = input_desc[0].dims;  // (batch, seq, seq)
+  auto out_dims = output_desc[0].dims;   // (batch, num_head, seq, seq)
+  auto out_num_dims = out_dims.nbDims;
+
+  int base_unit = 1;
+  for (int i = axis_ + 1; i < out_num_dims; ++i) {
+    PADDLE_ENFORCE_GT(out_dims.d[i], 0,
+                      platform::errors::InvalidArgument(
+                          "Input dimensions should be greater than 0"));
+    base_unit *= out_dims.d[i];
+  }
+
+  int lead_unit = 1;
+  for (int i = 0; i < axis_; ++i) {
+    PADDLE_ENFORCE_GT(out_dims.d[i], 0,
+                      platform::errors::InvalidArgument(
+                          "Input dimensions should be greater than 0"));
+    lead_unit *= out_dims.d[i];
+  }
+
+  PADDLE_ENFORCE_EQ(
+      out_dims.d[axis_], num_stack_,
+      platform::errors::InvalidArgument("number of stack axis should be same"));
+
+  cudaMemcpyAsync(workspace, reinterpret_cast<const void* const>(inputs),
+                  sizeof(void*) * out_dims.d[axis_], cudaMemcpyHostToDevice,
+                  stream);
+
+  const int num_stacks = out_dims.d[axis_];
+  dim3 num_blocks(num_stacks, lead_unit);
+  const int num_threads = 256;
+  auto infer_type = input_desc[0].type;
+
+  if (infer_type == nvinfer1::DataType::kFLOAT) {
+    float* output = static_cast<float*>(outputs[0]);
+    StackKernel<float><<<num_blocks, num_threads, 0, stream>>>(
+        reinterpret_cast<const float* const*>(workspace), output, num_stacks,
+        base_unit);
+  } else if (infer_type == nvinfer1::DataType::kHALF) {
+#ifdef SUPPORTS_CUDA_FP16
+    __half* output = static_cast<__half*>(outputs[0]);
+    StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>(
+        reinterpret_cast<const __half* const*>(workspace), output, num_stacks,
+        base_unit);
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "The cuda archs you specific should greater than 600."));
+#endif
+  } else {
+    PADDLE_THROW(
+        platform::errors::Fatal("The Stack TRT Plugin's input type only "
+                                "support float or half currently."));
+  }
+  return cudaGetLastError() != cudaSuccess;
+}
+
+StackPluginDynamicCreator::StackPluginDynamicCreator() {}
+
+const char* StackPluginDynamicCreator::getPluginName() const {
+  return "stack_plugin";
+}
+
+const char* StackPluginDynamicCreator::getPluginVersion() const { return "1"; }
+
+const nvinfer1::PluginFieldCollection*
+StackPluginDynamicCreator::getFieldNames() {
+  return &field_collection_;
+}
+
+nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) {
+  int axis = -1;
+  int num_stack = -1;
+
+  for (int i = 0; i < fc->nbFields; ++i) {
+    const std::string name(fc->fields[i].name);
+    if (name == "axis") {
+      axis = static_cast<const int*>(fc->fields[i].data)[0];
+    } else if (name == "num_stack") {
+      num_stack = static_cast<const int*>(fc->fields[i].data)[0];
+    } else {
+      PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" +
+                                           name +
+                                           "' when creating stack op plugin."));
+    }
+  }
+  return new StackPluginDynamic(axis, num_stack);
+}
+
+nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin(
+    const char* name, const void* serial_data, size_t serial_length) {
+  auto plugin = new StackPluginDynamic(serial_data, serial_length);
+  return plugin;
+}
+
+void StackPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) {
+  plugin_namespace_ = lib_namespace;
+}
+
+const char* StackPluginDynamicCreator::getPluginNamespace() const {
+  return plugin_namespace_.c_str();
+}
+
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <stdio.h>
+#include <cassert>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+class StackPluginDynamic : public DynamicPluginTensorRT {
+ public:
+  explicit StackPluginDynamic(int axis, int num_stack);
+  StackPluginDynamic(void const* serial_data, size_t serial_length);
+  ~StackPluginDynamic();
+  nvinfer1::IPluginV2DynamicExt* clone() const override;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) override;
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs, int nbOutputs) override;
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* out,
+                       int nbOutputs) override;
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const override;
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) override;
+
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* inputTypes,
+                                       int nbInputs) const override;
+
+  const char* getPluginType() const override;
+  int getNbOutputs() const override;
+  int initialize() override;
+  void terminate() override;
+  size_t getSerializationSize() const override;
+  void serialize(void* buffer) const override;
+  void destroy() override;
+
+ private:
+  int axis_;
+  int num_stack_;
+};
+
+class StackPluginDynamicCreator : public nvinfer1::IPluginCreator {
+ public:
+  StackPluginDynamicCreator();
+  const char* getPluginName() const override;
+  const char* getPluginVersion() const override;
+  const nvinfer1::PluginFieldCollection* getFieldNames() override;
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override;
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override;
+  void setPluginNamespace(const char* lib_namespace) override;
+  const char* getPluginNamespace() const override;
+
+ private:
+  std::string plugin_namespace_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+REGISTER_TRT_PLUGIN_V2(StackPluginDynamicCreator);
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -132,7 +132,9 @@ if(NOT APPLE AND WITH_MKLML)
    set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
    download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
-    set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
+    if(NOT WIN32)
+        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
+    endif()
 else()
    # TODO: fix this test on MACOS and OPENBLAS, the reason is that
    # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
@@ -192,8 +194,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz")
 inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
    ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
-
-set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
+if(NOT WIN32 AND NOT APPLE)
+    set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
+endif()

 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
@@ -215,7 +218,7 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test

 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
-if (NOT EXISTS ${OCR_INSTALL_DIR})
+if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz)
    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
@@ -231,7 +234,7 @@ set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysi

 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
-if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
+if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz)
    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
@@ -395,15 +398,15 @@ inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert

 if(WITH_GPU AND TENSORRT_FOUND)
    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models")
-    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz)
        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz")
    endif()
    set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test")
-    if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL})
+    if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz)
        inference_download_and_uncompress(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz")
    endif()
    set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test")
-    if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL})
+    if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz)
        inference_download_and_uncompress(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz")
    endif()
    inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc
@@ -432,7 +435,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
     
    set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
-    if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR})
+    if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz)
        inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz")
    endif()
    inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc
@@ -440,7 +443,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
            ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR})

    set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware")
-    if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR})
+    if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz)
        inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz")
    endif()
    inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc
@@ -448,12 +451,12 @@ if(WITH_GPU AND TENSORRT_FOUND)
            ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR})

    set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic")
-    if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2})
+    if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz)
        inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz")
    endif()

    set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu")
-    if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL})
+    if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz)
        inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz")
    endif()
    inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc
@@ -461,7 +464,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR})

    set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test")
-    if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL})
+    if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz)
        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz")
    endif()

@@ -470,7 +473,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)

    set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
-    if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL})
+    if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz)
        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
    endif()


--- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
@@ -54,9 +54,6 @@ TEST(PD_AnalysisConfig, use_gpu) {
  PD_SwitchIrOptim(config, true);
  bool ir_optim = PD_IrOptim(config);
  CHECK(ir_optim) << "NO";
-  PD_EnableMkldnnBfloat16(config);
-  bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
-  CHECK(!bfloat16_enable) << "NO";
  PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false,
                          false);
  bool trt_enable = PD_TensorrtEngineEnabled(config);

--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {

  config.SwitchUseFeedFetchOps(false);

-  int head_number = 12;
  int batch = 1;
  int min_seq_len = 1;
  int max_seq_len = 128;
@@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
      {"read_file_0.tmp_0", min_shape},
      {"read_file_0.tmp_1", min_shape},
      {"read_file_0.tmp_2", min_shape},
-      {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
+      {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}};
  std::map<std::string, std::vector<int>> max_input_shape = {
      {"read_file_0.tmp_0", max_shape},
      {"read_file_0.tmp_1", max_shape},
      {"read_file_0.tmp_2", max_shape},
-      {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
+      {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}};
  std::map<std::string, std::vector<int>> opt_input_shape = {
      {"read_file_0.tmp_0", opt_shape},
      {"read_file_0.tmp_1", opt_shape},
      {"read_file_0.tmp_2", opt_shape},
-      {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};
+      {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}};

  auto precision = AnalysisConfig::Precision::kFloat32;
  if (with_fp16) {

--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {

  config.SwitchUseFeedFetchOps(false);

-  int head_number = 12;
  int batch = 1;
  int min_seq_len = 1;
  int max_seq_len = 128;
@@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
      {"read_file_0.tmp_0", min_shape},
      {"read_file_0.tmp_1", min_shape},
      {"read_file_0.tmp_2", min_shape},
-      {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
+      {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}};
  std::map<std::string, std::vector<int>> max_input_shape = {
      {"read_file_0.tmp_0", max_shape},
      {"read_file_0.tmp_1", max_shape},
      {"read_file_0.tmp_2", max_shape},
-      {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
+      {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}};
  std::map<std::string, std::vector<int>> opt_input_shape = {
      {"read_file_0.tmp_0", opt_shape},
      {"read_file_0.tmp_1", opt_shape},
      {"read_file_0.tmp_2", opt_shape},
-      {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};
+      {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}};

  auto precision = AnalysisConfig::Precision::kFloat32;
  if (with_fp16) {

--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -45,7 +45,7 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
 endfunction()

 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
-if(NOT EXISTS ${WORD2VEC_INSTALL_DIR})
+if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz)
  inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
 endif()
 set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")

--- a/paddle/fluid/operators/affine_grid_op.cu
+++ b/paddle/fluid/operators/affine_grid_op.cu
@@ -62,11 +62,11 @@ __global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w,

    int theta_offset = n * 6;  // 2 * 3;
    // affine from (h_coor, w_coor) to (x, y)
-    output[index * 2] = theta[theta_offset] * h_coor +
-                        theta[theta_offset + 1] * w_coor +
+    output[index * 2] = theta[theta_offset] * w_coor +
+                        theta[theta_offset + 1] * h_coor +
                        theta[theta_offset + 2];
-    output[index * 2 + 1] = theta[theta_offset + 3] * h_coor +
-                            theta[theta_offset + 4] * w_coor +
+    output[index * 2 + 1] = theta[theta_offset + 3] * w_coor +
+                            theta[theta_offset + 4] * h_coor +
                            theta[theta_offset + 5];
  }
 }
@@ -86,13 +86,13 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h,

    int theta_offset = n * 6;  // 2 * 3;
    T out_grad_x = out_grad[index * 2];
-    platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
-    platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor);
    platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);

    T out_grad_y = out_grad[index * 2 + 1];
-    platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
-    platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * h_coor);
    platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
  }
 }

--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/arg_min_max_op_base.h"

 REGISTER_OPERATOR(
@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL(
                                    int16_t>,
    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
                                    uint8_t>);
+REGISTER_OP_VERSION(arg_max)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade argmax add a new attribute [flatten] and modify the attribute of dtype)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewAttr("flatten",
+                     "In order to compute the argmax over the flattened array "
+                     "when the "
+                     "argument `axis` in python API is None.",
+                     false)
+            .ModifyAttr(
+                "dtype",
+                "change the default value of dtype, the older version "
+                "is -1, means return the int64 indices."
+                "The new version is 3, return the int64 indices directly."
+                "And supporting the dtype of -1 in new version.",
+                3));
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -70,6 +70,8 @@ struct VisitDataArgMinMaxFunctor {
    auto axis = ctx.Attr<int64_t>("axis");
    auto keepdims = ctx.Attr<bool>("keepdims");
    const bool& flatten = ctx.Attr<bool>("flatten");
+    // paddle do not have the scalar tensor, just return the shape [1] tensor
+    if (flatten) keepdims = true;

    // if flatten, will construct the new dims for the cacluate
    framework::DDim x_dims;
@@ -164,15 +166,42 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
        platform::errors::InvalidArgument(
            "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));

+    const int& dtype = ctx->Attrs().Get<int>("dtype");
+    PADDLE_ENFORCE_EQ(
+        (dtype < 0 || dtype == 2 || dtype == 3), true,
+        platform::errors::InvalidArgument(
+            "The attribute of dtype in argmin/argmax must be [%s] or [%s], but "
+            "received [%s]",
+            paddle::framework::DataTypeToString(
+                framework::proto::VarType::INT32),
+            paddle::framework::DataTypeToString(
+                framework::proto::VarType::INT64),
+            paddle::framework::DataTypeToString(
+                static_cast<framework::proto::VarType::Type>(dtype))));
+
+    auto x_rank = x_dims.size();
+    if (axis < 0) axis += x_rank;
+    if (ctx->IsRuntime()) {
+      if (dtype == framework::proto::VarType::INT32) {
+        int64_t all_element_num = 0;
+        if (flatten) {
+          all_element_num = framework::product(x_dims);
+
+        } else {
+          all_element_num = x_dims[axis];
+        }
+        PADDLE_ENFORCE_LE(
+            all_element_num, INT_MAX,
+            "The element num of the argmin/argmax input at axis is "
+            "%d, is larger than int32 maximum value:%d, you must "
+            "set the dtype of argmin/argmax to 'int64'.",
+            all_element_num, INT_MAX);
+      }
+    }
    std::vector<int64_t> vec;
    if (flatten) {
-      // if is flatten, will return the only on element
-      if (keepdims) {
-        vec.emplace_back(static_cast<int64_t>(1));
-      }
+      vec.emplace_back(static_cast<int64_t>(1));
    } else {
-      auto x_rank = x_dims.size();
-      if (axis < 0) axis += x_rank;
      for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
      if (keepdims) {
        vec.emplace_back(static_cast<int64_t>(1));
@@ -194,10 +223,14 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "Output tensor.");
    AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
    AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
-    AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
    AddAttr<bool>("flatten",
                  "Flatten the input value, and search the min or max indices")
        .SetDefault(false);
+    AddAttr<int>("dtype",
+                 "(int, 3), the dtype of indices, the indices dtype must be "
+                 "int32, int64."
+                 "default dtype is int64, and proto value is 3.")
+        .SetDefault(3);
    AddComment(string::Sprintf(R"DOC(
      %s Operator.


--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/arg_min_max_op_base.h"

 REGISTER_OPERATOR(
@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL(
                                    int16_t>,
    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
                                    uint8_t>);
+REGISTER_OP_VERSION(arg_min)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade argmin add a new attribute [flatten] and modify the attribute of dtype)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewAttr("flatten",
+                     "In order to compute the argmin over the flattened array "
+                     "when the "
+                     "argument `axis` in python API is None.",
+                     false)
+            .ModifyAttr(
+                "dtype",
+                "change the default value of dtype, the older version "
+                "is -1, means return the int64 indices."
+                "The new version is 3, return the int64 indices directly."
+                "And supporting the dtype of -1 in new version.",
+                3));
--- a/paddle/fluid/operators/bernoulli_op.cu
+++ b/paddle/fluid/operators/bernoulli_op.cu
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>

-#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/bernoulli_op.h"
@@ -31,6 +30,10 @@ struct BernoulliCudaFunctor {
  __host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {}

  __host__ __device__ T operator()(const unsigned int n, const T p) const {
+    // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
+    // lines of error messages if, and it should be refined.
+    PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
+                   "The probability should be >=0 and <= 1, but got %f", p);
    thrust::minstd_rand rng;
    rng.seed(seed_);
    thrust::uniform_real_distribution<T> dist(0.0, 1.0);

--- a/paddle/fluid/operators/bernoulli_op.h
+++ b/paddle/fluid/operators/bernoulli_op.h
@@ -25,10 +25,12 @@ namespace operators {

 template <typename T>
 inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
-  PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange(
-                              "The probability should be <= 1, but got %f", p));
-  PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange(
-                              "The probability should be >= 1, but got %f", p));
+  PADDLE_ENFORCE_LE(p, 1.0,
+                    platform::errors::OutOfRange(
+                        "The probability should be <= 1, but got %f", p));
+  PADDLE_ENFORCE_GE(p, 0.0,
+                    platform::errors::OutOfRange(
+                        "The probability should be >= 0, but got %f", p));
  return static_cast<T>(rand < p);
 }


--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -66,7 +66,7 @@ template <typename DeviceContext, typename T>
 class ClipKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto max = context.Attr<T>("max");
+    auto max = static_cast<T>(context.Attr<float>("max"));
    Tensor max_cpu;
    if (context.HasInput("Max")) {
      auto* max_t = context.Input<Tensor>("Max");
@@ -77,8 +77,9 @@ class ClipKernel : public framework::OpKernel<T> {
      }
      max = max_data[0];
    }
+    max = static_cast<T>(max);

-    auto min = context.Attr<T>("min");
+    auto min = context.Attr<float>("min");
    Tensor min_cpu;
    if (context.HasInput("Min")) {
      auto* min_t = context.Input<Tensor>("Min");
@@ -141,7 +142,7 @@ template <typename DeviceContext, typename T>
 class ClipGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto max = context.Attr<T>("max");
+    auto max = static_cast<T>(context.Attr<float>("max"));
    Tensor max_cpu;
    if (context.HasInput("Max")) {
      auto* max_t = context.Input<Tensor>("Max");
@@ -152,8 +153,9 @@ class ClipGradKernel : public framework::OpKernel<T> {
      }
      max = max_data[0];
    }
+    max = static_cast<T>(max);

-    auto min = context.Attr<T>("min");
+    auto min = context.Attr<float>("min");
    Tensor min_cpu;
    if (context.HasInput("Min")) {
      auto* min_t = context.Input<Tensor>("Min");
@@ -164,6 +166,7 @@ class ClipGradKernel : public framework::OpKernel<T> {
      }
      min = min_data[0];
    }
+    min = static_cast<T>(min);

    auto* d_out =
        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -62,6 +62,34 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
    gpu_dev_ctx.Wait();
 #else
    PADDLE_THROW("Unexpected branch");
+#endif
+    return true;
+  } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    auto& xpu_dev_ctx = static_cast<const platform::XPUDeviceContext&>(dev_ctx);
+    platform::CPUPlace cpu;
+    char* p = reinterpret_cast<char*>(dest);
+    while (total_written < length) {
+      if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+        return false;
+      }
+
+      if (total_written + size_to_write > length) {
+        size_to_write = length - total_written;
+      }
+
+      memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place),
+                   reinterpret_cast<void*>(p), cpu, data, size_to_write);
+      p += size_to_write;
+      total_written += size_to_write;
+      input->Skip(size_to_write);
+    }
+    xpu_dev_ctx.Wait();
+#else
+    PADDLE_ENFORCE_NOT_NULL(
+        nullptr,
+        platform::errors::Unimplemented(
+            "Not supported XPU, please compile with option WITH_XPU=ON."));
 #endif
    return true;
  }

--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
@@ -25,25 +25,32 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
-                   "Input(Ids) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(W) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
-                   "Output(Outs) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Ids) of LookupTableOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(W) of LookupTableOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Outs) of LookupTableOp should not be null."));

    auto ids_dims = ctx->GetInputsDim("Ids");
    auto table_dims = ctx->GetInputDim("W");

-    PADDLE_ENFORCE_EQ(table_dims.size(), 2,
-                      "Only 2 dimensions of the 'Embedding' is supported.");
+    PADDLE_ENFORCE_EQ(
+        table_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "Only 2 dimensions of the 'Embedding' is supported."));

    for (auto &ids_dim : ids_dims) {
      PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
-                        "The dimension of the 'Ids' tensor must be 2.");
+                        platform::errors::InvalidArgument(
+                            "The dimension of the 'Ids' tensor must be 2."));
    }

    auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints");
+    // for fluid.embedding
    auto lookup_table_version =
        ctx->Attrs().Get<std::string>("lookup_table_version");


--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h
@@ -35,9 +35,30 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
    auto is_distributed = context.Attr<bool>("is_distributed");

+    auto lookup_table_version =
+        context.Attr<std::string>("lookup_table_version");
+
    operators::distributed::prefetchs(id_names, out_names, embedding_name,
                                      is_distributed, lookup_tables, endpoints,
                                      context, context.scope());
+
+    if (lookup_table_version == "lookup_table_v2") {
+      auto &scope = context.scope();
+      auto emb_dim =
+          scope.FindVar(embedding_name)->Get<framework::LoDTensor>().dims()[1];
+
+      for (size_t i = 0; i < id_names.size(); ++i) {
+        auto *id_var = scope.FindVar(id_names[i]);
+        auto *out_var = scope.FindVar(out_names[i]);
+        auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
+        auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
+
+        auto id_dims = id_tensor->dims();
+        out_tensor->Resize(framework::make_ddim(
+            {static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
+             static_cast<int64_t>(emb_dim)}));
+      }
+    }
  }
 };


--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -96,6 +96,42 @@ __global__ void RandomGeneratorWithSeed(const size_t n, const int* seed,
  }
 }

+template <typename T, typename MaskType>
+__global__ void RandomGeneratorWithGenerator(const size_t n, uint64_t seed,
+                                             const float dropout_prob,
+                                             const T* src, MaskType* mask_data,
+                                             T* dst, bool is_upscale_in_train,
+                                             uint64_t increment) {
+  curandStatePhilox4_32_10_t state;
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = 0;
+
+  MaskType mask;
+  T dest;
+  for (; idx < n; idx += blockDim.x * gridDim.x) {
+    T s = src[idx];
+    if (step_size == 0) {
+      curand_init(seed, idx, increment, &state);
+      step_size = blockDim.x * gridDim.x;
+    } else {
+      curand_init(seed, idx, increment, &state);
+    }
+    if (curand_uniform(&state) < dropout_prob) {
+      mask = 0;
+      dest = 0;
+    } else {
+      mask = 1;
+      if (is_upscale_in_train) {
+        dest = s / static_cast<T>(1.0f - dropout_prob);
+      } else {
+        dest = s;
+      }
+    }
+    mask_data[idx] = mask;
+    dst[idx] = dest;
+  }
+}
+
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
@@ -150,6 +186,17 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
            context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
      }

+      int device_id = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace())
+                          .GetDeviceId();
+      auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+      if (gen_cuda->GetIsInitPy() && (!context.Attr<bool>("fix_seed"))) {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        RandomGeneratorWithGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
+            size, seed_offset.first, dropout_prob, x_data, mask_data, y_data,
+            upscale_in_train, seed_offset.second);
+        return;
+      }
+
      RandomGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
          size, seed_data, dropout_prob, x_data, mask_data, y_data,
          upscale_in_train);

--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -29,6 +29,10 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename DeviceContext, typename T>
 class CPUDropoutKernel : public framework::OpKernel<T> {
 public:
@@ -116,9 +120,9 @@ class DropoutGradKernel : public framework::OpKernel<T> {
    auto* mask = context.Input<Tensor>("Mask");
    grad_x->mutable_data<T>(context.GetPlace());

-    auto M = EigenMatrix<uint8_t>::Reshape(*mask, 1);
-    auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
-    auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
+    auto M = EigenVector<uint8_t>::Flatten(*mask);
+    auto dX = EigenVector<T>::Flatten(*grad_x);
+    auto dY = EigenVector<T>::Flatten(*grad_y);

    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();

--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
@@ -31,6 +31,15 @@ struct ModFunctor {
  }
 };

+template <typename T>
+struct InverseModFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T res = b % a;
+    if ((res != 0) && ((res < 0) != (a < 0))) res += a;
+    return res;
+  }
+};
+
 template <typename T>
 struct ModFunctorFP {
  inline HOSTDEVICE T operator()(T a, T b) const {
@@ -40,13 +49,29 @@ struct ModFunctorFP {
  }
 };

+template <typename T>
+struct InverseModFunctorFP {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T res = fmod(b, a);
+    if ((res != 0) && ((a < 0) != (res < 0))) res += a;
+    return res;
+  }
+};
+
 template <typename DeviceContext, typename T>
 void elementwise_mod(const framework::ExecutionContext &ctx,
                     const framework::Tensor *x, const framework::Tensor *y,
                     framework::Tensor *z) {
  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        ModFunctor<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          ModFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseModFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseModFunctor<T>(), z);
+  }
 }

 template <typename DeviceContext, typename T>
@@ -54,8 +79,15 @@ void elementwise_mod_fp(const framework::ExecutionContext &ctx,
                        const framework::Tensor *x, const framework::Tensor *y,
                        framework::Tensor *z) {
  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<ModFunctorFP<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          ModFunctorFP<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<ModFunctorFP<T>, DeviceContext, T>(
+        ctx, x, y, axis, ModFunctorFP<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseModFunctorFP<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseModFunctorFP<T>(), z);
+  }
 }

 template <typename DeviceContext, typename T>

--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
@@ -24,15 +25,20 @@ template <typename T>
 struct GaussianGenerator {
  T mean_, std_;
  unsigned int seed_;
+  unsigned int offset_ = 0;

  __host__ __device__ GaussianGenerator(T mean, T std, int seed)
      : mean_(mean), std_(std), seed_(seed) {}

+  __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset)
+      : mean_(mean), std_(std), seed_(seed), offset_(offset) {}
+
  __host__ __device__ T operator()(const unsigned int n) const {
    thrust::minstd_rand rng;
    rng.seed(seed_);
    thrust::normal_distribution<T> dist(mean_, std_);
-    rng.discard(n);
+    unsigned int new_n = n + offset_;
+    rng.discard(new_n);
    return dist(rng);
  }
 };
@@ -43,9 +49,11 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto* tensor = context.Output<framework::Tensor>("Out");
    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    bool seed_flag = false;
    if (seed == 0) {
      std::random_device rd;
      seed = rd();
+      seed_flag = true;
    }
    T mean = static_cast<T>(context.Attr<float>("mean"));
    T std = static_cast<T>(context.Attr<float>("std"));
@@ -56,9 +64,23 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
    T* data = tensor->mutable_data<T>(context.GetPlace());

    int64_t size = tensor->numel();
-    thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                      thrust::device_ptr<T>(data),
-                      GaussianGenerator<T>(mean, std, seed));
+
+    int device_id =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+
+    if (gen_cuda->GetIsInitPy() && seed_flag) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int gen_offset = size * seed_offset.second;
+      thrust::transform(
+          index_sequence_begin, index_sequence_begin + size,
+          thrust::device_ptr<T>(data),
+          GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+    } else {
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(data),
+                        GaussianGenerator<T>(mean, std, seed));
+    }
  }
 };

@@ -69,17 +91,33 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
    auto* tensor = context.Output<framework::Tensor>("Out");
    T* data = tensor->mutable_data<T>(context.GetPlace());
    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    bool seed_flag = false;
    if (seed == 0) {
      std::random_device rd;
      seed = rd();
+      seed_flag = true;
    }
    T mean = static_cast<T>(context.Attr<float>("mean"));
    T std = static_cast<T>(context.Attr<float>("std"));
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
    int64_t size = tensor->numel();
-    thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                      thrust::device_ptr<T>(data),
-                      GaussianGenerator<T>(mean, std, seed));
+
+    int device_id =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+
+    if (gen_cuda->GetIsInitPy() && seed_flag) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int gen_offset = size * seed_offset.second;
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(data),
+                        GaussianGenerator<T>(mean, std, seed_offset.first,
+                                             seed_offset.second));
+    } else {
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(data),
+                        GaussianGenerator<T>(mean, std, seed));
+    }
  }
 };
 }  // namespace operators

--- a/paddle/fluid/operators/linspace_op.cu
+++ b/paddle/fluid/operators/linspace_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/linspace_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -19,6 +20,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+
 template <typename T>
 __global__ void LinspaceKernel(T start, double step, int64_t size, T* out) {
  CUDA_KERNEL_LOOP(index, size) {
@@ -35,15 +38,27 @@ template <typename T>
 class CUDALinspaceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* start_t = context.Input<framework::Tensor>("Start");
-    auto* stop_t = context.Input<framework::Tensor>("Stop");
+    auto* pre_start = context.Input<framework::Tensor>("Start");
+    auto* pre_stop = context.Input<framework::Tensor>("Stop");
    auto* num_t = context.Input<framework::Tensor>("Num");
    auto* out = context.Output<framework::Tensor>("Out");
+    auto dtype = static_cast<framework::proto::VarType::Type>(
+        context.Attr<int>("dtype"));
+
+    Tensor start_t;
+    Tensor stop_t;
+    auto start_dtype =
+        framework::OpKernelType(pre_start->type(), context.GetPlace());
+    auto stop_dtype =
+        framework::OpKernelType(pre_stop->type(), context.GetPlace());
+    auto out_dtype = framework::OpKernelType(dtype, context.GetPlace());
+    framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t);
+    framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t);

    framework::Tensor n;
-    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(start_t, platform::CPUPlace(), &n);
    T start = n.data<T>()[0];
-    framework::TensorCopy(*stop_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(stop_t, platform::CPUPlace(), &n);
    T stop = n.data<T>()[0];
    framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
    int32_t num = n.data<int32_t>()[0];

--- a/paddle/fluid/operators/linspace_op.h
+++ b/paddle/fluid/operators/linspace_op.h
@@ -14,20 +14,38 @@ limitations under the License. */

 #pragma once
 #include <functional>
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+
 template <typename T>
 class CPULinspaceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
-    T stop = context.Input<framework::Tensor>("Stop")->data<T>()[0];
+    auto* pre_start = context.Input<framework::Tensor>("Start");
+    auto* pre_stop = context.Input<framework::Tensor>("Stop");
    int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0];
    auto* out = context.Output<framework::Tensor>("Out");
+    auto dtype = static_cast<framework::proto::VarType::Type>(
+        context.Attr<int>("dtype"));
+
+    Tensor start_t;
+    Tensor stop_t;
+    auto start_dtype =
+        framework::OpKernelType(pre_start->type(), context.GetPlace());
+    auto stop_dtype =
+        framework::OpKernelType(pre_stop->type(), context.GetPlace());
+    auto out_dtype = framework::OpKernelType(dtype, context.GetPlace());
+    framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t);
+    framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t);
+
+    T start = start_t.data<T>()[0];
+    T stop = stop_t.data<T>()[0];
    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");

    out->Resize(framework::make_ddim({num}));

--- a/paddle/fluid/operators/load_combine_op.h
+++ b/paddle/fluid/operators/load_combine_op.h
@@ -70,6 +70,7 @@ class LoadCombineOpKernel : public framework::OpKernel<T> {
    auto out_vars = context.MultiOutputVar("Out");

    for (size_t i = 0; i < out_var_names.size(); i++) {
+      VLOG(4) << "loading tensor: " << out_var_names[i];
      PADDLE_ENFORCE_NOT_NULL(
          out_vars[i], platform::errors::InvalidArgument(
                           "The variable %s to be loaded cannot be found.",

--- a/paddle/fluid/operators/lookup_table_v2_op.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_v2_op.h"

 #include <memory>
-
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"

 namespace paddle {
@@ -196,3 +196,14 @@ REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>,
 REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad,
                       ops::LookupTableV2GradKernel<float>,
                       ops::LookupTableV2GradKernel<double>);
+
+/* ==========================  register checkpoint ===========================*/
+REGISTER_OP_VERSION(lookup_table_v2)
+    .AddCheckpoint(
+        R"ROC(fix lookup_table_v2, add input type `int32`)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .BugfixWithBehaviorChanged("lookup_table_v2 support input type "
+                                       "`int64`; after support input type "
+                                       "`int32/int64`"));
+
+/* ========================================================================== */
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ b/paddle/fluid/operators/lookup_table_v2_op.cu
@@ -85,6 +85,14 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids,
  }
 }

+template <typename T>
+__global__ void InputTypeCovert(const T *in_ids, const int64_t K,
+                                int64_t *out_ids) {
+  for (int i = 0; i < K; i++) {
+    out_ids[i] = (int64_t)(in_ids[i]);
+  }
+}
+
 template <typename T>
 class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
 public:
@@ -101,23 +109,37 @@ class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
    size_t D = table_t->dims()[1];
    size_t K = ids_t->numel();

-    auto *ids = ids_t->data<int64_t>();
-    auto *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>(context.GetPlace());
-
    dim3 threads(256, 4);
    dim3 grids(80, 1);

+    // copy GPU memory to CPU pinned memory
+    framework::Vector<int64_t> ids;
+    ids.resize(K);
+
+    const int64_t *ids_p = nullptr;
+
+    if (ids_t->type() == framework::proto::VarType::INT32) {
+      InputTypeCovert<
+          int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          ids_t->data<int>(), K, ids.MutableData(context.GetPlace()));
+      ids_p = ids.MutableData(context.GetPlace());
+    } else {
+      ids_p = ids_t->data<int64_t>();
+    }
+
+    auto *table = table_t->data<T>();
+    auto *output = output_t->mutable_data<T>(context.GetPlace());
+
    if (padding_idx == -1)
      LookupTableV2<
          T, 256, 4, 80,
          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+          output, table, ids_p, N, K, D, padding_idx);
    else
      LookupTableV2<
          T, 256, 4, 80,
          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+          output, table, ids_p, N, K, D, padding_idx);
  }
 };

@@ -139,16 +161,24 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {

      auto *ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();
-
+      dim3 threads(128, 8);
+      dim3 grids(8, 1);
      auto stream = dev_ctx.stream();
      // copy GPU memory to CPU pinned memory
      framework::Vector<int64_t> new_rows;
      new_rows.resize(ids_num);
      auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());

-      // TODO(yuyang18): Strange code here.
-      memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
-                   gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
+      if (ids->type() == framework::proto::VarType::INT32) {
+        InputTypeCovert<
+            int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+            ids->data<int>(), ids_num,
+            new_rows.MutableData(context.GetPlace()));
+      } else {
+        memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
+                     gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
+      }
+
      d_table->set_rows(new_rows);

      auto *d_table_value = d_table->mutable_value();
@@ -177,17 +207,32 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
      int N = d_table_t->dims()[0];
      int D = d_table_t->dims()[1];
      int K = ids_t->numel();
-      const int64_t *ids = ids_t->data<int64_t>();
+
+      dim3 threads(128, 8);
+      dim3 grids(8, 1);
+      // copy GPU memory to CPU pinned memory
+      framework::Vector<int64_t> ids;
+      ids.resize(K);
+
+      const int64_t *ids_p = nullptr;
+
+      if (ids_t->type() == framework::proto::VarType::INT32) {
+        InputTypeCovert<
+            int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+            ids_t->data<int>(), K, ids.MutableData(context.GetPlace()));
+        ids_p = ids.MutableData(context.GetPlace());
+      } else {
+        ids_p = ids_t->data<int64_t>();
+      }
+
      const T *d_output = d_output_t->data<T>();
      T *d_table = d_table_t->mutable_data<T>(context.GetPlace());

      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));

-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
      LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
-          d_table, d_output, ids, N, K, D);
+          d_table, d_output, ids_p, N, K, D);
    }
  }
 };

--- a/paddle/fluid/operators/lookup_table_v2_op.h
+++ b/paddle/fluid/operators/lookup_table_v2_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <algorithm>
 #include <string>
 #include <vector>

@@ -45,84 +46,70 @@ class LookupTableV2Kernel : public framework::OpKernel<T> {
    auto *output_t = context.Output<LoDTensor>("Out");  // float tensor
    auto *table_var = context.InputVar("W");

-    auto id_name = context.InputNames("Ids").front();
-    auto embedding_name = context.InputNames("W").front();
-    auto out_name = context.OutputNames("Out").front();
-
-    // for remote prefetch
-    auto epmap = context.Attr<std::vector<std::string>>("epmap");
-    auto remote_prefetch = context.Attr<bool>("remote_prefetch");
-    auto table_names = context.Attr<std::vector<std::string>>("table_names");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    int64_t ids_numel = ids_t->numel();

-    if (remote_prefetch && !epmap.empty()) {
-// if epmap is not empty, then the parameter will be fetched from remote
-// parameter server
+    std::vector<int64_t> ids;
+    ids.reserve(ids_numel);

-#ifdef PADDLE_WITH_DISTRIBUTE
-      operators::distributed::prefetch(id_name, out_name, embedding_name, false,
-                                       table_names, epmap, context,
-                                       context.scope());
-#else
-      PADDLE_THROW(
-          "paddle is not compiled with distribute support, can not do "
-          "parameter prefetch!");
-#endif
+    if (ids_t->type() == framework::proto::VarType::INT32) {
+      std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_numel,
+                     std::back_inserter(ids),
+                     [&](int id) { return static_cast<int64_t>(id); });
    } else {
-      int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-      int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
-      int64_t ids_numel = ids_t->numel();
-
-      if (table_var->IsType<LoDTensor>()) {
-        auto *table_t = context.Input<LoDTensor>("W");
-        int64_t row_number = table_t->dims()[0];
-        int64_t row_width = table_t->dims()[1];
-
-        auto *table = table_t->data<T>();
-        auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-        for (int64_t i = 0; i < ids_numel; ++i) {
-          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-            memset(output + i * row_width, 0, row_width * sizeof(T));
-          } else {
-            PADDLE_ENFORCE_LT(
-                ids[i], row_number,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
-            PADDLE_ENFORCE_GE(
-                ids[i], 0,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
-            memcpy(output + i * row_width, table + ids[i] * row_width,
-                   row_width * sizeof(T));
-          }
+      framework::TensorToVector(*ids_t, &ids);
+    }
+
+    if (table_var->IsType<LoDTensor>()) {
+      auto *table_t = context.Input<LoDTensor>("W");
+      int64_t row_number = table_t->dims()[0];
+      int64_t row_width = table_t->dims()[1];
+
+      auto *table = table_t->data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+      for (int64_t i = 0; i < ids_numel; ++i) {
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(
+              ids[i], row_number,
+              "Variable value (input) of OP(fluid.layers.embedding) "
+              "expected >= 0 and < %ld, but got %ld. Please check input "
+              "value.",
+              row_number, ids[i]);
+          PADDLE_ENFORCE_GE(
+              ids[i], 0,
+              "Variable value (input) of OP(fluid.layers.embedding) "
+              "expected >= 0 and < %ld, but got %ld. Please check input "
+              "value.",
+              row_number, ids[i]);
+          memcpy(output + i * row_width, table + ids[i] * row_width,
+                 row_width * sizeof(T));
        }
-      } else if (table_var->IsType<SelectedRows>()) {
-        const auto &table_t = table_var->Get<SelectedRows>();
-        int64_t row_width = table_t.value().dims()[1];
-        const auto *table = table_t.value().data<T>();
-        auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-        auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-        for (int64_t i = 0; i < ids_numel; ++i) {
-          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-            memset(output + i * row_width, 0, row_width * sizeof(T));
-          } else {
-            PADDLE_ENFORCE_GE(
-                ids[i], 0,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0. But received %ld",
-                ids[i]);
-            auto id_index = table_t.Index(ids[i]);
-            PADDLE_ENFORCE_GE(
-                id_index, 0, "the input key should be exists. But received %d.",
-                id_index);
-            blas.VCOPY(row_width, table + id_index * row_width,
-                       output + i * row_width);
-          }
+      }
+    } else if (table_var->IsType<SelectedRows>()) {
+      const auto &table_t = table_var->Get<SelectedRows>();
+      int64_t row_width = table_t.value().dims()[1];
+      const auto *table = table_t.value().data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+      for (int64_t i = 0; i < ids_numel; ++i) {
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_GE(
+              ids[i], 0,
+              "Variable value (input) of OP(fluid.layers.embedding) "
+              "expected >= 0. But received %ld",
+              ids[i]);
+          auto id_index = table_t.Index(ids[i]);
+          PADDLE_ENFORCE_GE(id_index, 0,
+                            "the input key should be exists. But received %d.",
+                            id_index);
+          blas.VCOPY(row_width, table + id_index * row_width,
+                     output + i * row_width);
        }
      }
    }
@@ -151,17 +138,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
    // Since paddings are not trainable and fixed in forward, the gradient of
    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
-      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *ids_t = context.Input<LoDTensor>("Ids");
      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      int64_t ids_num = ids_t->numel();
+
+      std::vector<int64_t> ids;
+      ids.reserve(ids_num);

-      auto *ids_data = ids->data<int64_t>();
-      int64_t ids_num = ids->numel();
+      if (ids_t->type() == framework::proto::VarType::INT32) {
+        std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_num,
+                       std::back_inserter(ids),
+                       [&](int id) { return static_cast<int64_t>(id); });
+      } else {
+        framework::TensorToVector(*ids_t, &ids);
+      }

-      std::vector<int64_t> new_rows;
-      new_rows.resize(ids_num);
-      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
-      d_table->set_rows(new_rows);
+      d_table->set_rows(ids);

      auto *d_table_value = d_table->mutable_value();
      d_table_value->Resize({ids_num, table_dim[1]});
@@ -185,11 +178,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());

    } else {
-      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *ids_t = context.Input<LoDTensor>("Ids");
      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
+      int64_t ids_num = ids_t->numel();
+
+      std::vector<int64_t> ids;
+      ids.reserve(ids_num);
+
+      if (ids_t->type() == framework::proto::VarType::INT32) {
+        std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_num,
+                       std::back_inserter(ids),
+                       [&](int id) { return static_cast<int64_t>(id); });
+      } else {
+        framework::TensorToVector(*ids_t, &ids);
+      }

-      auto *ids_data = ids->data<int64_t>();
+      auto *ids_data = ids.data();

      int64_t N = table_dim[0];
      int64_t D = table_dim[1];
@@ -199,7 +204,7 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {

      memset(d_table_data, 0, d_table->numel() * sizeof(T));

-      for (int64_t i = 0; i < ids->numel(); ++i) {
+      for (int64_t i = 0; i < ids_num; ++i) {
        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
          // the gradient of padding_idx should be 0, already done by memset, so
          // do nothing.

--- a/paddle/fluid/operators/math/concat_and_split.h
+++ b/paddle/fluid/operators/math/concat_and_split.h
@@ -65,13 +65,14 @@ class SplitFunctor {
 }  // namespace operators
 }  // namespace paddle

-#define FOR_ALL_TYPES(macro) \
-  macro(int);                \
-  macro(float);              \
-  macro(double);             \
-  macro(bool);               \
-  macro(int64_t);            \
-  macro(int16_t);            \
-  macro(uint8_t);            \
-  macro(int8_t);             \
-  macro(::paddle::platform::float16)
+#define FOR_ALL_TYPES(macro)          \
+  macro(int);                         \
+  macro(float);                       \
+  macro(double);                      \
+  macro(bool);                        \
+  macro(int64_t);                     \
+  macro(int16_t);                     \
+  macro(uint8_t);                     \
+  macro(int8_t);                      \
+  macro(::paddle::platform::float16); \
+  macro(::paddle::platform::bfloat16)
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -34,6 +34,7 @@ namespace math {
 using float16 = paddle::platform::float16;

 template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
+template struct SetConstant<platform::CPUDeviceContext, platform::bfloat16>;
 template struct SetConstant<platform::CPUDeviceContext, float>;
 template struct SetConstant<platform::CPUDeviceContext, double>;
 template struct SetConstant<platform::CPUDeviceContext, int>;
@@ -41,16 +42,18 @@ template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
 template struct SetConstant<platform::CPUDeviceContext, uint8_t>;

-#define DEFINE_CPU_TRANS(RANK)                                             \
-  template struct Transpose<platform::CPUDeviceContext, platform::float16, \
-                            RANK>;                                         \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>;      \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
-  template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
-  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;       \
-  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;    \
+#define DEFINE_CPU_TRANS(RANK)                                              \
+  template struct Transpose<platform::CPUDeviceContext, platform::float16,  \
+                            RANK>;                                          \
+  template struct Transpose<platform::CPUDeviceContext, platform::bfloat16, \
+                            RANK>;                                          \
+  template struct Transpose<platform::CPUDeviceContext, float, RANK>;       \
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;      \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;         \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;        \
+  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;     \
  template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;

 DEFINE_CPU_TRANS(1);

--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -33,10 +33,12 @@ class MKLDNNActivationKernel
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
-    PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for X tensor");
-    PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef,
-                      "Wrong format set for X tensor");
+    PADDLE_ENFORCE_EQ(
+        x->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for X tensor"));
+    PADDLE_ENFORCE_NE(
+        x->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for X tensor"));

    Functor functor;
    functor(ctx);
@@ -50,9 +52,11 @@ class MKLDNNActivationGradKernel
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
    PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input OutGrad tensor");
+                      platform::errors::InvalidArgument(
+                          "Wrong layout set for Input OutGrad tensor"));
    PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef,
-                      "Wrong format set for Input OutGrad tensor");
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Input OutGrad tensor"));

    Functor functor;
    functor(ctx);
@@ -82,7 +86,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,

  PADDLE_ENFORCE(
      x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4,
-      "Input dim must be with 2, 3 or 4");
+      platform::errors::Unimplemented("Input dim must be with 2, 3 or 4"));

  auto src_tz = framework::vectorize<int64_t>(x->dims());


--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -262,9 +262,11 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input diff_y tensor");
+                      platform::errors::InvalidArgument(
+                          "Wrong layout set for Input diff_y tensor"));
    PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef,
-                      "Wrong format set for Input diff_y tensor");
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Input diff_y tensor"));

    auto src_tz = paddle::framework::vectorize<int64_t>(x->dims());
    auto scale_tz = paddle::framework::vectorize<int64_t>(scale->dims());

--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -30,10 +30,12 @@ using platform::to_void_cast;

 static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
  for (auto* input : inputs) {
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
-                      "Wrong format set for Input tensor");
+    PADDLE_ENFORCE_EQ(
+        input->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for Input tensor"));
+    PADDLE_ENFORCE_NE(
+        input->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for Input tensor"));
  }
 }

@@ -49,7 +51,7 @@ static platform::CPUPlace GetCpuPlace(
    const paddle::framework::ExecutionContext& ctx) {
  auto place = ctx.GetPlace();
  PADDLE_ENFORCE(paddle::platform::is_cpu_place(place),
-                 "It must use CPUPlace.");
+                 platform::errors::InvalidArgument("It must use CPUPlace."));
  return BOOST_GET_CONST(platform::CPUPlace, place);
 }


--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -561,7 +561,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {

      PADDLE_ENFORCE_EQ(
          !fuse_residual_conn || !force_fp32_output, true,
-          "residual fusion does not support force output with fp32");
+          platform::errors::Unimplemented(
+              "residual fusion does not support force output with fp32"));

      auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;

@@ -625,7 +626,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
              ? dilations.size() == 3 && dilations[0] == 1 &&
                    dilations[1] == 1 && dilations[2] == 1
              : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
-          true, "dilation in convolution is not implemented yet");
+          true, platform::errors::Unimplemented(
+                    "dilation in convolution is not implemented yet"));

      const K* filter_data = filter->data<K>();
      auto scale_in_data = ctx.Attr<float>("Scale_in");
@@ -887,7 +889,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
            "The output_grad tensor's layout should be %d, but got %d.",
            DataLayout::kMKLDNN, output_grad->layout()));
    PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::undef,
-                      "Wrong format set for output_grad tensor");
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for output_grad tensor"));

    PADDLE_ENFORCE_EQ(
        ctx.Attr<bool>("is_test"), false,
@@ -1052,7 +1055,11 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
      astream.wait();

      filter_grad->set_layout(DataLayout::kMKLDNN);
-      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
+      // in OneDNN groups in convolution are treated as separate dimension
+      // which is not the case in paddlepaddle
+      auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p);
+      filter_grad->set_format(platform::MKLDNNFormatForSize(
+          g > 1 ? weights_tz.size() - 1 : weights_tz.size(), filter_fmt));
    }
    if (input_grad) {
      auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive(

--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -117,7 +117,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {

    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
-        "dilation in convolution is not implemented yet");
+        platform::errors::Unimplemented(
+            "dilation in convolution is not implemented yet"));

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();

--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -83,19 +83,24 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
    Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));

-    PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef,
-                      "Wrong format set for Input tensor");
+    PADDLE_ENFORCE_EQ(
+        in_x->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for Input tensor"));
+    PADDLE_ENFORCE_NE(
+        in_x->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for Input tensor"));

    PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input output_grad tensor");
+                      platform::errors::InvalidArgument(
+                          "Wrong layout set for Input output_grad tensor"));
    PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef,
-                      "Wrong format set for Input output_grad tensor");
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Input output_grad tensor"));

    PADDLE_ENFORCE_EQ(
        ctx.Attr<bool>("is_test"), false,
-        "is_test attribute should be set to False in training phase.");
+        platform::errors::InvalidArgument(
+            "is_test attribute should be set to False in training phase."));

    std::string pooling_type = ctx.Attr<std::string>("pooling_type");


--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -140,7 +140,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {

    PADDLE_ENFORCE_EQ(
        dout->dims(), dx->dims(),
-        "The shape of softmax_grad's input and output must be identical.");
+        platform::errors::InvalidArgument(
+            "The shape of softmax_grad's input and output must be identical."));

    auto dims = dout->dims();  // input and output share the same shape
    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());

--- a/paddle/fluid/operators/randint_op.cu
+++ b/paddle/fluid/operators/randint_op.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/uniform_random_op.h"

@@ -49,15 +50,23 @@ class GPURandintKernel : public framework::OpKernel<T> {

    int64_t size = out->numel();
    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+
+    /*
    std::minstd_rand engine;
    if (seed == 0) {
      std::random_device rd;
      seed = rd();
    }
    engine.seed(seed);
+    */
+
    std::uniform_int_distribution<> dist(context.Attr<int>("low"),
                                         context.Attr<int>("high") - 1);
-    for (int64_t i = 0; i < size; ++i) data[i] = dist(engine);
+    auto engine = framework::GetCPURandomEngine(seed);
+
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(*engine);
+    }

    if (platform::is_gpu_place(context.GetPlace())) {
      // Copy tensor to out

--- a/paddle/fluid/operators/truncated_gaussian_random_op.cu
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include <limits>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"

@@ -46,6 +47,37 @@ struct TruncatedNormal {
  }
 };

+template <typename T>
+struct TruncatedNormalOffset {
+  T mean, std;
+  T a_normal_cdf;
+  T b_normal_cdf;
+  unsigned int seed;
+  T numeric_min;
+  int offset_;
+
+  __host__ __device__ TruncatedNormalOffset(T mean, T std, T numeric_min,
+                                            int seed, int offset)
+      : mean(mean),
+        std(std),
+        seed(seed),
+        numeric_min(numeric_min),
+        offset_(offset) {
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+  }
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    rng.discard(n);
+    T value = dist(rng);
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+  }
+};
+
 template <typename T>
 class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
 public:
@@ -54,14 +86,31 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
    T* data = tensor->mutable_data<T>(context.GetPlace());

    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    bool seed_flag = false;
    if (seed == 0) {
      std::random_device rd;
      seed = rd();
+      seed_flag = true;
    }
    T mean = static_cast<T>(context.Attr<float>("mean"));
    T std = static_cast<T>(context.Attr<float>("std"));
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
    int64_t size = tensor->numel();
+
+    int device_id =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+
+    if (gen_cuda->GetIsInitPy() && seed_flag) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int gen_offset = size * seed_offset.second;
+      thrust::transform(
+          index_sequence_begin, index_sequence_begin + size,
+          thrust::device_ptr<T>(data),
+          TruncatedNormalOffset<T>(mean, std, std::numeric_limits<T>::min(),
+                                   seed_offset.first, seed_offset.second));
+    }
+
    thrust::transform(
        index_sequence_begin, index_sequence_begin + size,
        thrust::device_ptr<T>(data),

--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -51,6 +51,39 @@ struct UniformGenerator {
  }
 };

+template <typename T>
+struct UniformGeneratorOffset {
+  T min_, max_;
+  unsigned int seed_;
+  T diag_val_;
+  unsigned int diag_num_;
+  unsigned int diag_step_;
+  int offset_;
+  __host__ __device__ UniformGeneratorOffset(T min, T max, int seed,
+                                             int diag_num, int diag_step,
+                                             T diag_val, int offset)
+      : min_(min),
+        max_(max),
+        seed_(seed),
+        diag_num_(diag_num),
+        diag_step_(diag_step),
+        diag_val_(diag_val),
+        offset_(offset) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(min_, max_);
+    rng.discard(n + offset_);
+    T out = dist(rng);
+    unsigned int remainder = n % (diag_step_ + 1);
+    if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
+      out = diag_val_;
+    }
+    return out;
+  }
+};
+
 // It seems that Eigen::Tensor::random in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
@@ -89,10 +122,11 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
    }
    T* data = tensor->mutable_data<T>(context.GetPlace());
    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-
+    bool seed_flag = false;
    if (seed == 0) {
      std::random_device rd;
      seed = rd();
+      seed_flag = true;
    }

    T min = static_cast<T>(context.Attr<float>("min"));
@@ -104,10 +138,23 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
    T diag_val = static_cast<T>(context.Attr<float>("diag_val"));
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
    int64_t size = tensor->numel();
-    thrust::transform(
-        index_sequence_begin, index_sequence_begin + size,
-        thrust::device_ptr<T>(data),
-        UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val));
+    int device_id =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+    if (gen_cuda->GetIsInitPy() && seed_flag) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int gen_offset = size * seed_offset.second;
+      thrust::transform(
+          index_sequence_begin, index_sequence_begin + size,
+          thrust::device_ptr<T>(data),
+          UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
+                                    diag_step, diag_val, gen_offset));
+    } else {
+      thrust::transform(
+          index_sequence_begin, index_sequence_begin + size,
+          thrust::device_ptr<T>(data),
+          UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val));
+    }
  }
 };


--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/unsqueeze_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"

 namespace paddle {
@@ -327,6 +329,7 @@ REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
 REGISTER_OP_CPU_KERNEL(
    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -334,12 +337,14 @@ REGISTER_OP_CPU_KERNEL(
    unsqueeze_grad,
    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
    unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -347,6 +352,7 @@ REGISTER_OP_CPU_KERNEL(
    unsqueeze2_grad,
    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/unsqueeze_op.cu.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cu.cc
@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -30,6 +31,7 @@ REGISTER_OP_CUDA_KERNEL(
    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext,
                             plat::float16>,
+    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -38,6 +40,7 @@ REGISTER_OP_CUDA_KERNEL(
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -47,6 +50,7 @@ REGISTER_OP_CUDA_KERNEL(
    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
                              plat::float16>,
+    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -136,6 +136,8 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)

+cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
+
 nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)

 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)

--- a/paddle/fluid/platform/bfloat16.h
+++ b/paddle/fluid/platform/bfloat16.h
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <limits>
+#if !defined(_WIN32)
+#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+#else
+#define PADDLE_ALIGN(x) __declspec(align(x))
+#endif
+
+#include <cstring>
+#include "paddle/fluid/platform/hostdevice.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace platform {
+
+struct PADDLE_ALIGN(2) bfloat16 {
+ public:
+  uint16_t x;
+
+  bfloat16() = default;
+  bfloat16(const bfloat16& o) = default;
+  bfloat16& operator=(const bfloat16& o) = default;
+  bfloat16(bfloat16&& o) = default;
+  bfloat16& operator=(bfloat16&& o) = default;
+  ~bfloat16() = default;
+
+  HOSTDEVICE inline explicit bfloat16(float val) {
+    std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);
+  }
+
+  template <class T>
+  HOSTDEVICE inline explicit bfloat16(const T& val)
+      : x(bfloat16(static_cast<float>(val)).x) {}
+
+  HOSTDEVICE inline bfloat16& operator=(bool b) {
+    x = b ? 0x3f80 : 0;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(int8_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(uint8_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(int16_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(uint16_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(int32_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(uint32_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(int64_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(uint64_t val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(float val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline bfloat16& operator=(double val) {
+    x = bfloat16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline explicit operator float() const {
+    float val = 0.f;
+    uint16_t temp = x;
+    memcpy(reinterpret_cast<char*>(&val) + 2, reinterpret_cast<char*>(&temp),
+           2);
+    return val;
+  }
+
+  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
+
+  HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(static_cast<float>(*this));
+  }
+
+  HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(static_cast<float>(*this));
+  }
+};
+
+HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+
+HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(static_cast<float>(a) - static_cast<float>(b));
+}
+
+HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(static_cast<float>(a) * static_cast<float>(b));
+}
+
+HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+
+HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) {
+  bfloat16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+}
+
+HOSTDEVICE inline bfloat16& operator+=(bfloat16& a,  // NOLINT
+                                       const bfloat16& b) {
+  a = bfloat16(static_cast<float>(a) + static_cast<float>(b));
+  return a;
+}
+
+HOSTDEVICE inline bfloat16& operator-=(bfloat16& a,  // NOLINT
+                                       const bfloat16& b) {
+  a = bfloat16(static_cast<float>(a) - static_cast<float>(b));
+  return a;
+}
+
+HOSTDEVICE inline bfloat16& operator*=(bfloat16& a,  // NOLINT
+                                       const bfloat16& b) {
+  a = bfloat16(static_cast<float>(a) * static_cast<float>(b));
+  return a;
+}
+
+HOSTDEVICE inline bfloat16& operator/=(bfloat16& a,  // NOLINT
+                                       const bfloat16& b) {
+  a = bfloat16(static_cast<float>(a) / static_cast<float>(b));
+  return a;
+}
+
+HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) {
+  bfloat16 res;
+  res.x = a;
+  return res;
+}
+
+HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) {
+  return static_cast<float>(a) == static_cast<float>(b);
+}
+
+HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) {
+  return static_cast<float>(a) != static_cast<float>(b);
+}
+
+HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) {
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+
+HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
+}
+
+HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) {
+  return static_cast<float>(a) > static_cast<float>(b);
+}
+
+HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
+}
+
+HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
+  return (a.x & 0x7FFF) > 0x7F80;
+}
+
+HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
+  return (a.x & 0x7F80) == 0x7F80;
+}
+
+HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
+  return !((isnan)(a)) && !((isinf)(a));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) {
+  os << a.x;
+  return os;
+}
+
+}  // namespace platform
+}  // namespace paddle
+
+namespace std {
+
+template <>
+struct is_pod<paddle::platform::bfloat16> {
+  static const bool value =
+      is_trivial<paddle::platform::bfloat16>::value &&
+      is_standard_layout<paddle::platform::bfloat16>::value;
+};
+
+template <>
+struct is_floating_point<paddle::platform::bfloat16>
+    : std::integral_constant<
+          bool, std::is_same<paddle::platform::bfloat16,
+                             typename std::remove_cv<
+                                 paddle::platform::bfloat16>::type>::value> {};
+template <>
+struct is_signed<paddle::platform::bfloat16> {
+  static const bool value = true;
+};
+
+template <>
+struct is_unsigned<paddle::platform::bfloat16> {
+  static const bool value = false;
+};
+
+inline bool isnan(const paddle::platform::bfloat16& a) {
+  return paddle::platform::isnan(a);
+}
+
+inline bool isinf(const paddle::platform::bfloat16& a) {
+  return paddle::platform::isinf(a);
+}
+
+template <>
+struct numeric_limits<paddle::platform::bfloat16> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 8;
+  static const int digits10 = 2;
+  static const int max_digits10 = 9;
+  static const int radix = 2;
+  static const int min_exponent = -125;
+  static const int min_exponent10 = -37;
+  static const int max_exponent = 128;
+  static const int max_exponent10 = 38;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static paddle::platform::bfloat16(min)() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x007f);
+  }
+  static paddle::platform::bfloat16 lowest() {
+    return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
+  }
+  static paddle::platform::bfloat16(max)() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
+  }
+  static paddle::platform::bfloat16 epsilon() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x3400);
+  }
+  static paddle::platform::bfloat16 round_error() {
+    return paddle::platform::bfloat16(0.5);
+  }
+  static paddle::platform::bfloat16 infinity() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
+  }
+  static paddle::platform::bfloat16 quiet_NaN() {
+    return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
+  }
+  static paddle::platform::bfloat16 signaling_NaN() {
+    return paddle::platform::raw_uint16_to_bfloat16(0xff81);
+  }
+  static paddle::platform::bfloat16 denorm_min() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x0001);
+  }
+};
+
+}  // namespace std
+
+namespace Eigen {
+
+using bfloat16 = paddle::platform::bfloat16;
+
+template <>
+struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  HOSTDEVICE static inline bfloat16 epsilon() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x3400);
+  }
+  HOSTDEVICE static inline bfloat16 dummy_precision() {
+    return bfloat16(1e-5f);
+  }
+  HOSTDEVICE static inline bfloat16 highest() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
+  }
+  HOSTDEVICE static inline bfloat16 lowest() {
+    return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
+  }
+  HOSTDEVICE static inline bfloat16 infinity() {
+    return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
+  }
+  HOSTDEVICE static inline bfloat16 quiet_NaN() {
+    return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
+  }
+};
+namespace numext {
+
+template <>
+HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+template <>
+HOSTDEVICE inline bfloat16 exp(const bfloat16& a) {
+  return bfloat16(::expf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 erf(const bfloat16& a) {
+  return bfloat16(::erff(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 log(const bfloat16& a) {
+  return bfloat16(::logf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) {
+  return bfloat16(::tanhf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) {
+  return bfloat16(::sqrtf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) {
+  return bfloat16(::ceilf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 floor(const bfloat16& a) {
+  return bfloat16(::floorf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 round(const bfloat16& a) {
+  return bfloat16(::roundf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::powf(static_cast<float>(a), static_cast<float>(b)));
+}
+
+template <>
+HOSTDEVICE inline bfloat16 abs(const bfloat16& a) {
+  return bfloat16(::fabs(static_cast<float>(a)));
+}
+
+}  // namespace numext
+}  // namespace Eigen
--- a/paddle/fluid/platform/bfloat16_test.cc
+++ b/paddle/fluid/platform/bfloat16_test.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/bfloat16.h"
+
+#include <vector>
+
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/init.h"
+
+namespace paddle {
+namespace platform {
+
+using bfloat16 = paddle::platform::bfloat16;
+
+TEST(bfloat16, conversion_cpu) {
+  // Conversion from float
+  EXPECT_EQ(bfloat16(1.0f).x, 0x3f80);
+  EXPECT_EQ(bfloat16(0.5f).x, 0x3f00);
+  EXPECT_EQ(bfloat16(0.33333f).x, 0x3eaa);
+  EXPECT_EQ(bfloat16(0.0f).x, 0x0000);
+  EXPECT_EQ(bfloat16(-0.0f).x, 0x8000);
+  EXPECT_EQ(bfloat16(65504.0f).x, 0x477f);
+  EXPECT_EQ(bfloat16(65536.0f).x, 0x4780);
+
+  // Conversion from double
+  EXPECT_EQ(bfloat16(1.0).x, 0x3f80);
+  EXPECT_EQ(bfloat16(0.5).x, 0x3f00);
+  EXPECT_EQ(bfloat16(0.33333).x, 0x3eaa);
+  EXPECT_EQ(bfloat16(0.0).x, 0x0000);
+  EXPECT_EQ(bfloat16(-0.0).x, 0x8000);
+  EXPECT_EQ(bfloat16(65504.0).x, 0x477f);
+  EXPECT_EQ(bfloat16(65536.0).x, 0x4780);
+
+  // Conversion from int
+  EXPECT_EQ(bfloat16(-1).x, 0xbf80);
+  EXPECT_EQ(bfloat16(0).x, 0x0000);
+  EXPECT_EQ(bfloat16(1).x, 0x3f80);
+  EXPECT_EQ(bfloat16(2).x, 0x4000);
+  EXPECT_EQ(bfloat16(3).x, 0x4040);
+
+  // Conversion from bool
+  EXPECT_EQ(bfloat16(true).x, 0x3f80);
+  EXPECT_EQ(bfloat16(false).x, 0x0000);
+
+  // Assignment operator
+  bfloat16 v_assign;
+  v_assign = bfloat16(0.f);
+  EXPECT_EQ(v_assign.x, 0x0000);
+  v_assign = 0.5f;
+  EXPECT_EQ(v_assign.x, 0x3f00);
+  v_assign = 0.33333;
+  EXPECT_EQ(v_assign.x, 0x3eaa);
+  v_assign = -1;
+  EXPECT_EQ(v_assign.x, 0xbf80);
+
+  // Conversion operator
+  EXPECT_EQ(static_cast<float>(bfloat16(0.5f)), 0.5f);
+  EXPECT_NEAR(static_cast<double>(bfloat16(0.33333)), 0.33333, 0.01);
+  EXPECT_EQ(static_cast<int>(bfloat16(-1)), -1);
+  EXPECT_EQ(static_cast<bool>(bfloat16(true)), true);
+}
+
+TEST(bfloat16, arithmetic_cpu) {
+  EXPECT_NEAR(static_cast<float>(bfloat16(1) + bfloat16(1)), 2, 0.001);
+  EXPECT_EQ(static_cast<float>(bfloat16(5) + bfloat16(-5)), 0);
+  EXPECT_NEAR(static_cast<float>(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f,
+              0.01);
+  EXPECT_EQ(static_cast<float>(bfloat16(3) - bfloat16(5)), -2);
+  EXPECT_NEAR(static_cast<float>(bfloat16(0.66667f) - bfloat16(0.33333f)),
+              0.33334f, 0.01);
+  EXPECT_NEAR(static_cast<float>(bfloat16(3.3f) * bfloat16(2.0f)), 6.6f, 0.01);
+  EXPECT_NEAR(static_cast<float>(bfloat16(-2.1f) * bfloat16(-3.0f)), 6.3f, 0.1);
+  EXPECT_NEAR(static_cast<float>(bfloat16(2.0f) / bfloat16(3.0f)), 0.66667f,
+              0.01);
+  EXPECT_EQ(static_cast<float>(bfloat16(1.0f) / bfloat16(2.0f)), 0.5f);
+  EXPECT_EQ(static_cast<float>(-bfloat16(512.0f)), -512.0f);
+  EXPECT_EQ(static_cast<float>(-bfloat16(-512.0f)), 512.0f);
+}
+
+TEST(bfloat16, comparison_cpu) {
+  EXPECT_TRUE(bfloat16(1.0f) == bfloat16(1.0f));
+  EXPECT_FALSE(bfloat16(-1.0f) == bfloat16(-0.5f));
+  EXPECT_TRUE(bfloat16(1.0f) != bfloat16(0.5f));
+  EXPECT_FALSE(bfloat16(-1.0f) != bfloat16(-1.0f));
+  EXPECT_TRUE(bfloat16(1.0f) < bfloat16(2.0f));
+  EXPECT_FALSE(bfloat16(-1.0f) < bfloat16(-1.0f));
+  EXPECT_TRUE(bfloat16(1.0f) <= bfloat16(1.0f));
+  EXPECT_TRUE(bfloat16(2.0f) > bfloat16(1.0f));
+  EXPECT_FALSE(bfloat16(-2.0f) > bfloat16(-2.0f));
+  EXPECT_TRUE(bfloat16(2.0f) >= bfloat16(2.0f));
+}
+
+TEST(bfloat16, lod_tensor_cpu) {
+  framework::LoDTensor lod_tensor;
+
+  std::vector<bfloat16> input_data = {bfloat16(1.0f), bfloat16(0.5f),
+                                      bfloat16(0.33333f), bfloat16(0.0f)};
+  EXPECT_EQ(input_data[0].x, 0x3f80);
+  EXPECT_EQ(input_data[1].x, 0x3f00);
+  EXPECT_EQ(input_data[2].x, 0x3eaa);
+  EXPECT_EQ(input_data[3].x, 0x0000);
+
+  lod_tensor.Resize({4, 1});
+  lod_tensor.set_lod(framework::LoD({{0, 2, 4}}));
+  bfloat16* data_ptr = lod_tensor.mutable_data<bfloat16>(CPUPlace());
+
+  EXPECT_NE(data_ptr, nullptr);
+  EXPECT_EQ(input_data.size(), static_cast<size_t>(lod_tensor.numel()));
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    data_ptr[i] = input_data[i];
+    EXPECT_EQ(data_ptr[i].x, input_data[i].x);
+  }
+}
+
+TEST(bfloat16, floating) {
+  // compile time assert.
+  PADDLE_ENFORCE_EQ(
+      std::is_floating_point<bfloat16>::value, true,
+      platform::errors::Fatal("std::is_floating_point with bfloat16 data type "
+                              "should be equal to true but it is not"));
+}
+
+TEST(bfloat16, print) {
+  bfloat16 a = bfloat16(1.0f);
+  std::cout << a << std::endl;
+}
+
+// CPU test
+TEST(bfloat16, isinf) {
+  bfloat16 a;
+  a.x = 0x7f80;
+  bfloat16 b = bfloat16(INFINITY);
+  bfloat16 c = static_cast<bfloat16>(INFINITY);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(c), true);
+}
+
+TEST(bfloat16, isnan) {
+  bfloat16 a;
+  a.x = 0x7fff;
+  bfloat16 b = bfloat16(NAN);
+  bfloat16 c = static_cast<bfloat16>(NAN);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(c), true);
+}
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -38,14 +38,15 @@ extern void *cublas_dso_handle;
 */
 #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
  struct DynLoad__##__name {                                                 \
-    using FUNC_TYPE = decltype(&::__name);                                   \
    template <typename... Args>                                              \
-    inline cublasStatus_t operator()(Args... args) {                         \
+    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {  \
+      using cublas_func =                                                    \
+          decltype(::__name(std::declval<Args>()...)) (*)(Args...);          \
      std::call_once(cublas_dso_flag, []() {                                 \
        cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
      });                                                                    \
      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
-      return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...);               \
+      return reinterpret_cast<cublas_func>(p_##__name)(args...);             \
    }                                                                        \
  };                                                                         \
  extern DynLoad__##__name __name

--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -161,6 +161,12 @@ inline mkldnn::memory::data_type MKLDNNGetDataType<uint8_t>() {
  return mkldnn::memory::data_type::u8;
 }

+template <>
+inline mkldnn::memory::data_type
+MKLDNNGetDataType<paddle::platform::bfloat16>() {
+  return mkldnn::memory::data_type::bf16;
+}
+
 inline void Reorder(mkldnn::memory src, mkldnn::memory dst,
                    const mkldnn::engine& engine) {
  auto reorder_prim = mkldnn::reorder(src, dst);

--- a/paddle/fluid/pybind/generator_py.cc
+++ b/paddle/fluid/pybind/generator_py.cc
@@ -59,6 +59,7 @@ void BindGenerator(py::module* m_ptr) {
      .def_property("_is_init_py", &framework::Generator::GetIsInitPy,
                    &framework::Generator::SetIsInitPy);
  m.def("default_cpu_generator", &framework::DefaultCPUGenerator);
-}  // end Generator
-}  // end namespace pybind
+  m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator);
+}
+}  // namespace pybind
 }  // namespace paddle
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "pybind11/numpy.h"
@@ -104,6 +105,7 @@ struct ValidDTypeToPyArrayChecker {
  }

 DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16);
+DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16);
 DECLARE_VALID_DTYPE_TO_PY_ARRAY(float);
 DECLARE_VALID_DTYPE_TO_PY_ARRAY(double);
 DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool);
@@ -119,6 +121,9 @@ inline std::string TensorDTypeToPyDTypeStr(
  if (type == proto_type) {                                                 \
    if (std::is_same<T, platform::float16>::value) {                        \
      return "e";                                                           \
+    } else if (std::is_same<T, platform::bfloat16>::value) {                \
+      /* NumPy character code of uint16 due to no support for bfloat16 */   \
+      return "H";                                                           \
    } else {                                                                \
      constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker<T>::kValue; \
      PADDLE_ENFORCE_EQ(                                                    \
@@ -262,10 +267,10 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj,
    SetTensorFromPyArrayT<paddle::platform::float16, P>(self, array, place,
                                                        zero_copy);
  } else if (py::isinstance<py::array_t<uint16_t>>(array)) {
-    // TODO(cql): temporary keeping uint16, which is used for casting float16
-    // before. It should be depracated later.
-    SetTensorFromPyArrayT<paddle::platform::float16, P>(self, array, place,
-                                                        zero_copy);
+    // since there is still no support for bfloat16 in NumPy,
+    // uint16 is used for casting bfloat16
+    SetTensorFromPyArrayT<paddle::platform::bfloat16, P>(self, array, place,
+                                                         zero_copy);
  } else if (py::isinstance<py::array_t<bool>>(array)) {
    SetTensorFromPyArrayT<bool, P>(self, array, place, zero_copy);
  } else {
@@ -479,6 +484,8 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self,
  switch (src_type) {
    case framework::proto::VarType::FP16:
      return _sliceAndConcat<paddle::platform::float16>(self, obj, dim);
+    case framework::proto::VarType::BF16:
+      return _sliceAndConcat<paddle::platform::bfloat16>(self, obj, dim);
    case framework::proto::VarType::FP32:
      return _sliceAndConcat<float>(self, obj, dim);
    case framework::proto::VarType::FP64:

--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -29,8 +29,10 @@ function(train_test TARGET_NAME)
                PROPERTIES DEPENDS test_${TARGET_NAME})
        set_tests_properties(test_train_${TARGET_NAME}${arg}
                PROPERTIES LABELS "RUN_TYPE=DIST")
-        set_tests_properties(test_train_${TARGET_NAME}${arg}
-                PROPERTIES TIMEOUT 150)
+        if(NOT WIN32 AND NOT APPLE)
+            set_tests_properties(test_train_${TARGET_NAME}${arg}
+                    PROPERTIES TIMEOUT 150)
+        endif()
    endforeach()
 endfunction(train_test)


--- a/paddle/http.log
+++ b/paddle/http.log
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -20,13 +20,12 @@ rem       Paddle CI Task On Windows Platform
 rem =================================================

 rem -------clean up environment-----------
+wmic process where name="op_function_generator.exe" call terminate  2>NUL
 set work_dir=%cd%
-if exist build rmdir build /s/q
 mkdir build
 cd /d build
 tree .
 dir paddle\fluid\pybind\Release
-taskkill /f /im op_function_generator.exe  2>NUL

 rem ------initialize the virtual environment------
 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
@@ -216,7 +215,7 @@ pip install -U %PADDLE_WHL_FILE_WIN% --user
 if %ERRORLEVEL% NEQ 0 (
    call paddle_winci\Scripts\deactivate.bat 2>NUL
    echo pip install whl package failed!
-    exit /b 3
+    exit /b 1
 )

 python %work_dir%\paddle\scripts\installation_validate.py
@@ -225,7 +224,7 @@ goto:eof
 :test_whl_pacakage_error
 call paddle_winci\Scripts\deactivate.bat 2>NUL
 echo Test import paddle failed, will exit!
-exit /b 3
+exit /b 1

 rem ---------------------------------------------------------------------------------------------
 :unit_test
@@ -248,6 +247,9 @@ goto:eof

 :unit_test_error
 call paddle_winci\Scripts\deactivate.bat 2>NUL
+for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
+set end=%end:~4,10%
+call :timestamp "%start%" "%end%" "TestCases Total"
 echo Running unit tests failed, will exit!
 exit /b 8

@@ -268,7 +270,7 @@ goto:eof
 :test_inference_error
 call paddle_winci\Scripts\deactivate.bat 2>NUL
 echo Testing fluid library for inference failed!
-exit /b 5
+exit /b 1

 rem ---------------------------------------------------------------------------------------------
 :check_change_of_unittest
@@ -399,7 +401,7 @@ taskkill /f /im git-remote-https.exe 2>NUL
 taskkill /f /im vctip.exe 2>NUL
 taskkill /f /im cvtres.exe 2>NUL
 taskkill /f /im rc.exe 2>NUL
-taskkill /f /im op_function_generator.exe  2>NUL
+wmic process where name="op_function_generator.exe" call terminate 2>NUL
 taskkill /f /im python.exe  2>NUL
 call paddle_winci\Scripts\deactivate.bat 2>NUL
 taskkill /f /im python.exe  2>NUL

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -273,7 +273,7 @@ function cmake_gen() {
 function abort(){
    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
    echo "Please use pre-commit to check what is wrong." 1>&2
-    exit 1
+    exit 4
 }

 function check_style() {
@@ -303,7 +303,7 @@ function check_style() {
    
    if [ $commit_files == 'off' ];then
        echo "code format error"
-        exit 1
+        exit 4
    fi
    trap : 0
 }
@@ -528,6 +528,7 @@ EOF
        elif [ "$1" == "cp37-cp37m" ]; then
            pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        fi
+        set +e
        ut_startTime_s=`date +%s`
        ctest --output-on-failure -j $2;mactest_error=$?
        ut_endTime_s=`date +%s`
@@ -959,7 +960,7 @@ set +x
                    
                    retry_unittests_record="$retry_unittests_record$failed_test_lists"
                    failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
-                    read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                    read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
                    echo "========================================="
                    echo "This is the ${exec_time_array[$exec_times]} time to re-run"
                    echo "========================================="
@@ -1395,24 +1396,49 @@ function example() {
    fi
 }

+function summary_check_problems() {
+    set +x
+    local check_style_code=$1
+    local example_code=$2
+    if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then
+      echo "========================================"
+      echo "summary problems:"
+      echo "========================================"
+      if [ $check_style_code -ne 0 ];then
+        echo "- Check code style failed! Please check the log and fix problems."
+      fi
+      if [ $example_code -ne 0 ];then
+        echo "- Check example code failed! Please check the log and fix problems."
+      fi
+      [ $check_style_code -ne 0 ] && exit $check_style_code
+      [ $example_code -ne 0 ] && exit $example_code
+    fi
+    set -x
+}
+
 function main() {
    local CMD=$1 
    local parallel_number=$2
    init
    if [ "$CMD" != "assert_file_approvals" ];then
      python ${PADDLE_ROOT}/tools/summary_env.py
+      bash ${PADDLE_ROOT}/tools/get_cpu_info.sh
    fi
    case $CMD in
      build_only)
        cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
        ;;
      build_and_check)
-        check_style
+        set +e
+        $(check_style >&2)
+        check_style_code=$?
        generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number}
        cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
        check_sequence_op_unittest
        generate_api_spec ${PYTHON_ABI:-""} "PR"
-        example
+        $(example >&2)
+        example_code=$?
+        summary_check_problems $check_style_code $example_code
        assert_api_spec_approvals
        ;;
      build)

--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -217,6 +217,8 @@ from .tensor.search import index_select  #DEFINE_ALIAS
 from .tensor.search import nonzero  #DEFINE_ALIAS
 from .tensor.search import sort  #DEFINE_ALIAS
 from .framework.random import manual_seed  #DEFINE_ALIAS
+from .framework.random import get_cuda_rng_state  #DEFINE_ALIAS
+from .framework.random import set_cuda_rng_state  #DEFINE_ALIAS
 from .framework import Variable  #DEFINE_ALIAS
 from .framework import ParamAttr  #DEFINE_ALIAS
 from .framework import create_global_var  #DEFINE_ALIAS
@@ -230,6 +232,7 @@ from .framework import grad  #DEFINE_ALIAS
 from .framework import no_grad  #DEFINE_ALIAS
 from .framework import save  #DEFINE_ALIAS
 from .framework import load  #DEFINE_ALIAS
+from .framework import SaveLoadConfig  #DEFINE_ALIAS
 from .framework import DataParallel  #DEFINE_ALIAS

 from .framework import NoamDecay  #DEFINE_ALIAS
@@ -259,7 +262,7 @@ from .device import get_device
 from .fluid.dygraph.base import enable_dygraph as disable_static  #DEFINE_ALIAS
 from .fluid.dygraph.base import disable_dygraph as enable_static  #DEFINE_ALIAS
 from .fluid.framework import in_dygraph_mode as in_dynamic_mode  #DEFINE_ALIAS
-from .fluid.dygraph.base import no_grad  #DEFINE_ALIAS
+from .fluid.dygraph.base import no_grad_ as no_grad  #DEFINE_ALIAS

 from . import jit
 from . import static
@@ -267,5 +270,6 @@ from . import static
 # high-level api
 from .hapi import Model
 from .hapi import callbacks
+from .hapi import summary
 import paddle.text
 import paddle.vision
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -73,20 +73,21 @@ def broadcast(tensor, src, group=0):
    Examples:
        .. code-block:: python

-        import paddle
-        import paddle.prepare_context as prepare_context
-
-        paddle.disable_static()
-        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
-        prepare_context()
-        if paddle.ParallelEnv().local_rank == 0:
-            np_data = np.array([[4, 5, 6], [4, 5, 6]])
-        else:
-            np_data = np.array([[1, 2, 3], [1, 2, 3]])
-        data = paddle.to_tensor(np_data)
-        paddle.distributed.broadcast(data, 1)
-        out = data.numpy()
-        # [[1, 2, 3], [1, 2, 3]]
+            import numpy as np
+            import paddle
+            from paddle.distributed import init_parallel_env
+
+            paddle.disable_static()
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            init_parallel_env()
+            if paddle.distributed.ParallelEnv().local_rank == 0:
+                np_data = np.array([[4, 5, 6], [4, 5, 6]])
+            else:
+                np_data = np.array([[1, 2, 3], [1, 2, 3]])
+            data = paddle.to_tensor(np_data)
+            paddle.distributed.broadcast(data, 1)
+            out = data.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
    """
    if in_dygraph_mode():
        return core.ops.c_broadcast(tensor, tensor, 'root', src,
@@ -129,21 +130,22 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0):
    Examples:
        .. code-block:: python

-        import paddle
-        from paddle.distributed import ReduceOp
-        import paddle.prepare_context as prepare_context
-
-        paddle.disable_static()
-        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
-        prepare_context()
-        if paddle.ParallelEnv().local_rank == 0:
-            np_data = np.array([[4, 5, 6], [4, 5, 6]])
-        else:
-            np_data = np.array([[1, 2, 3], [1, 2, 3]])
-        data = paddle.to_tensor(np_data)
-        paddle.distributed.all_reduce(data)
-        out = data.numpy()
-        # [[5, 7, 9], [5, 7, 9]]
+            import numpy as np
+            import paddle
+            from paddle.distributed import ReduceOp
+            from paddle.distributed import init_parallel_env
+
+            paddle.disable_static()
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            init_parallel_env()
+            if paddle.distributed.ParallelEnv().local_rank == 0:
+                np_data = np.array([[4, 5, 6], [4, 5, 6]])
+            else:
+                np_data = np.array([[1, 2, 3], [1, 2, 3]])
+            data = paddle.to_tensor(np_data)
+            paddle.distributed.all_reduce(data)
+            out = data.numpy()
+            # [[5, 7, 9], [5, 7, 9]]
    """
    if in_dygraph_mode():
        if op == ReduceOp.SUM:
@@ -204,20 +206,21 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
    Examples:
        .. code-block:: python

-        import paddle
-        import paddle.prepare_context as prepare_context
-
-        paddle.disable_static()
-        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
-        prepare_context()
-        if paddle.ParallelEnv().local_rank == 0:
-            np_data = np.array([[4, 5, 6], [4, 5, 6]])
-        else:
-            np_data = np.array([[1, 2, 3], [1, 2, 3]])
-        data = paddle.to_tensor(np_data)
-        paddle.distributed.reduce(data, 0)
-        out = data.numpy()
-        # [[5, 7, 9], [5, 7, 9]]
+            import numpy as np
+            import paddle
+            from paddle.distributed import init_parallel_env
+
+            paddle.disable_static()
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            init_parallel_env()
+            if paddle.distributed.ParallelEnv().local_rank == 0:
+                np_data = np.array([[4, 5, 6], [4, 5, 6]])
+            else:
+                np_data = np.array([[1, 2, 3], [1, 2, 3]])
+            data = paddle.to_tensor(np_data)
+            paddle.distributed.reduce(data, 0)
+            out = data.numpy()
+            # [[5, 7, 9], [5, 7, 9]]
    """
    if in_dygraph_mode():
        if op == ReduceOp.SUM:
@@ -286,25 +289,26 @@ def all_gather(tensor_list, tensor, group=0):
    Examples:
        .. code-block:: python

-        import paddle
-        import paddle.prepare_context as prepare_context
-
-        paddle.disable_static()
-        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
-        prepare_context()
-        tensor_list = []
-        if paddle.ParallelEnv().local_rank == 0:
-            np_data1 = np.array([[4, 5, 6], [4, 5, 6]])
-            np_data2 = np.array([[4, 5, 6], [4, 5, 6]])
-            data1 = paddle.to_tensor(np_data1)
-            data2 = paddle.to_tensor(np_data2)
-            paddle.distributed.all_gather(tensor_list, data1)
-        else:
-            np_data1 = np.array([[1, 2, 3], [1, 2, 3]])
-            np_data2 = np.array([[1, 2, 3], [1, 2, 3]])
-            data1 = paddle.to_tensor(np_data1)
-            data2 = paddle.to_tensor(np_data2)
-            out = paddle.distributed.all_gather(tensor_list, data2)
+            import numpy as np
+            import paddle
+            from paddle.distributed import init_parallel_env
+
+            paddle.disable_static()
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            init_parallel_env()
+            tensor_list = []
+            if paddle.distributed.ParallelEnv().local_rank == 0:
+                np_data1 = np.array([[4, 5, 6], [4, 5, 6]])
+                np_data2 = np.array([[4, 5, 6], [4, 5, 6]])
+                data1 = paddle.to_tensor(np_data1)
+                data2 = paddle.to_tensor(np_data2)
+                paddle.distributed.all_gather(tensor_list, data1)
+            else:
+                np_data1 = np.array([[1, 2, 3], [1, 2, 3]])
+                np_data2 = np.array([[1, 2, 3], [1, 2, 3]])
+                data1 = paddle.to_tensor(np_data1)
+                data2 = paddle.to_tensor(np_data2)
+                paddle.distributed.all_gather(tensor_list, data2)
    """
    op_type = 'c_allgather'
    helper = LayerHelper(op_type, **locals())
@@ -359,25 +363,26 @@ def scatter(tensor, tensor_list=None, src=0, group=0):
    Examples:
        .. code-block:: python

-        import paddle
-        import paddle.prepare_context as prepare_context
-
-        paddle.disable_static()
-        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
-        prepare_context()
-        if paddle.ParallelEnv().local_rank == 0:
-            np_data1 = np.array([7, 8, 9])
-            np_data2 = np.array([10, 11, 12])
-        else:
-            np_data1 = np.array([1, 2, 3])
-            np_data2 = np.array([4, 5, 6])
-        data1 = paddle.to_tensor(np_data1)
-        data2 = paddle.to_tensor(np_data2)
-        if paddle.ParallelEnv().local_rank == 0:
-            paddle.distributed.scatter(data1, src=1)
-        else:
-            paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
-        out = data1.numpy()
+            import numpy as np
+            import paddle
+            from paddle.distributed import init_parallel_env
+
+            paddle.disable_static()
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            init_parallel_env()
+            if paddle.distributed.ParallelEnv().local_rank == 0:
+                np_data1 = np.array([7, 8, 9])
+                np_data2 = np.array([10, 11, 12])
+            else:
+                np_data1 = np.array([1, 2, 3])
+                np_data2 = np.array([4, 5, 6])
+            data1 = paddle.to_tensor(np_data1)
+            data2 = paddle.to_tensor(np_data2)
+            if paddle.distributed.ParallelEnv().local_rank == 0:
+                paddle.distributed.scatter(data1, src=1)
+            else:
+                paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
+            out = data1.numpy()
    """
    op_type = 'c_scatter'
    global _default_group
@@ -425,13 +430,13 @@ def barrier(group=0):
    Examples:
        .. code-block:: python

-        import paddle
-        import paddle.prepare_context as prepare_context
+            import paddle
+            from paddle.distributed import init_parallel_env

-        paddle.disable_static()
-        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
-        prepare_context()
-        paddle.distributed.barrier()
+            paddle.disable_static()
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            init_parallel_env()
+            paddle.distributed.barrier()
    """
    op_type = 'barrier'
    temp = paddle.fill_constant([1], dtype="int32", value="1")

--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -50,3 +50,10 @@ distributed_optimizer = fleet.distributed_optimizer
 save_inference_model = fleet.save_inference_model
 save_persistables = fleet.save_persistables
 minimize = fleet.minimize
+distributed_model = fleet.distributed_model
+step = fleet.step
+clear_grad = fleet.clear_grad
+set_lr = fleet.set_lr
+get_lr = fleet.get_lr
+state_dict = fleet.state_dict
+set_state_dict = fleet.set_state_dict
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -118,7 +118,7 @@ class DistributedStrategy(object):
            strategy = fleet.DistributedStrategy()
            strategy.dgc = True
            strategy.recompute = True
-            strategy.recompute_configs = {"checkpoint": ["x"]}
+            strategy.recompute_configs = {"checkpoints": ["x"]}
            strategy.save_to_prototxt("dist_strategy.prototxt")
        """
        with open(output, "w") as fout:
@@ -133,7 +133,7 @@ class DistributedStrategy(object):

            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
-            strategy.load_from_prototxt("dist_strategy.protoxt")
+            strategy.load_from_prototxt("dist_strategy.prototxt")
        """
        with open(pb_file, 'r') as f:
            self.strategy = google.protobuf.text_format.Merge(
@@ -147,6 +147,7 @@ class DistributedStrategy(object):
        Examples:
          .. code-block:: python

+            import paddle
            exe_strategy = paddle.fluid.ExecutionStrategy()
            exe_strategy.num_threads = 10
            exe_strategy.num_iteration_per_drop_scope = 10
@@ -179,6 +180,7 @@ class DistributedStrategy(object):
        Examples:
          .. code-block:: python

+            import paddle
            build_strategy = paddle.fluid.BuildStrategy()
            build_strategy.enable_sequential_execution = True
            build_strategy.fuse_elewise_add_act_ops = True
@@ -252,14 +254,19 @@ class DistributedStrategy(object):
        a dict.

        **Notes**:
-            **Detailed arguments for a_sync_configs**
-            **k_step**: number of local optimization updates before communication
-            **max_merge_var_num**: maximum number of merged gradients before communication
-            **send_queue_size**: a buffer size of worker communication
-            **independent_recv_thread**: if we are using independent recv thread for communication
-            **thread_pool_size**: number of thread pool
-            **send_wait_times**: waiting time for sending gradients
-            **runtime_split_send_recv**: if we are using Tensor split for send and recv during runtime
+            k_step(int): number of local optimization updates before communication
+
+            max_merge_var_num(int): maximum number of merged gradients before communication
+
+            send_queue_size(int): a buffer size of worker communication
+
+            independent_recv_thread(bool): if we are using independent recv thread for communication
+
+            thread_pool_size(int): number of thread pool
+
+            send_wait_times(int): waiting time for sending gradients
+
+            runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime

        Examples:
          .. code-block:: python
@@ -270,11 +277,12 @@ class DistributedStrategy(object):

            strategy = fleet.DistributedStrategy()
            strategy.a_sync = True  # by default this is True
-            configs = {"k_step": 10000, "send_queue_size": 32}
+            configs = {"k_steps": 1024, "send_queue_size": 32}
            strategy.a_sync_configs = configs

            # code block for defining loss and local optimizer
            # sgd = fleet.distributed_optimizer(optimizer, strategy)
+
        """
        return get_msg_dict(self.strategy.a_sync_configs)

@@ -314,14 +322,21 @@ class DistributedStrategy(object):
        settings that can be configured through a dict.

        **Notes**:
-            **init_loss_scaling(float)**: The initial loss scaling factor. Default 32768.
-            **use_dynamic_loss_scaling(bool)**: Whether to use dynamic loss scaling. Default True.
-            **incr_every_n_steps(int)**: Increases loss scaling every n consecutive steps with finite gradients. Default 1000.
-            **decr_every_n_nan_or_inf(int)**: Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.
-            **incr_ratio(float)**: The multiplier to use when increasing the loss scaling. Default 2.0.
-            **decr_ratio(float)**: The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5.
-            **custom_white_list(list[str])**: Users' custom white list which always execution fp16.
-            **custom_black_list(list[str])**: Users' custom black list which forbidden execution fp16.
+            init_loss_scaling(float): The initial loss scaling factor. Default 32768.
+
+            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. Default True.
+
+            incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. Default 1000.
+
+            decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.
+
+            incr_ratio(float): The multiplier to use when increasing the loss scaling. Default 2.0.
+
+            decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5.
+
+            custom_white_list(list[str]): Users' custom white list which always execution fp16.
+
+            custom_black_list(list[str]): Users' custom black list which forbidden execution fp16.

        Examples:
          .. code-block:: python
@@ -553,7 +568,7 @@ class DistributedStrategy(object):
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.recompute = True
-            strategy.recompute_configs = {"checkpionts": ["x", "y"]}
+            strategy.recompute_configs = {"checkpoints": ["x", "y"]}

        """
        return get_msg_dict(self.strategy.recompute_configs)
@@ -603,6 +618,7 @@ class DistributedStrategy(object):

        **Notes**:
            **Detailed arguments for pipeline_configs**
+
            **micro_batch**: the number of small batches in each user defined batch

        Examples:
@@ -626,10 +642,10 @@ class DistributedStrategy(object):
    @property
    def localsgd(self):
        """
-        Indicating whether we are using Local SGD training. For more details, please refer to
-        [Don't Use Large Mini-Batches, Use Local SGD](https://arxiv.org/pdf/1808.07217.pdf),
+        Indicating whether we are using Local SGD training. Default Value: False
+        For more details, please refer to
+        `Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.

-        Default Value: False

        Examples:
          .. code-block:: python
@@ -655,13 +671,12 @@ class DistributedStrategy(object):
        setting that can be configured through a dict.

        **Notes**:
-            **k_steps(int)**: The local steps for training before parameter
-                synchronization. Default 1. If strategy.auto is set True, the
-                local steps will be calculated automatically during training.
-                The algorithm is referenced in this paper: 
-                [Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD](https://arxiv.org/pdf/1810.08313.pdf).
-                In this case, k_steps indicates the first local steps which
-                is suggested setting to 1.
+            k_steps(int) The local steps for training before parameter synchronization. Default 1.
+
+            If strategy.auto is set True, the local steps will be calculated automatically during training.
+            The algorithm is referenced in this paper: 
+            `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
+            In this case, k_steps indicates the first local steps which is suggested setting to 1.

        Examples:
          .. code-block:: python
@@ -712,14 +727,16 @@ class DistributedStrategy(object):
        settings that can be configured through a dict.

        **Notes**:
-            **rampup_begin_step(int)**: The beginning step from which gradient compression is implemented. Default 0.
-            **rampup_step(int)**: Time steps used in sparsity warm-up periods. Default is 1.
-                For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100,
-                it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array
-                ends, it will use 0.999 then and after.
-            **sparsity(list[float])**: Get top important element from gradient tensor, the ratio is (1 - sparsity).
-                Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important
-                element will be transmitted.
+            rampup_begin_step(int): The beginning step from which gradient compression is implemented. Default 0.
+
+            rampup_step(int): Time steps used in sparsity warm-up periods. Default is 1. \
+                    For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \
+                    it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array \
+                    ends, it will use 0.999 then and after.
+
+            sparsity(list[float]): Get top important element from gradient tensor, the ratio is (1 - sparsity). \
+                    Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important \
+                    element will be transmitted.

        Examples:
          .. code-block:: python
@@ -749,7 +766,8 @@ class DistributedStrategy(object):
        to model parameters.

        Examples:
-        .. code-block:: python
+          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.gradient_merge = True
@@ -768,11 +786,15 @@ class DistributedStrategy(object):
    def gradient_merge_configs(self):
        """
        the key-value configs of distribute_strategy
-        Keys: 
-            k_steps (int): the update period of the parameters
-            avg (bool): whether to average the gradients of each mini-batch,
-                the default value is `True`
-        Example:
+
+        **Note**:
+            k_steps(int): the update period of the parameters.
+
+            avg(bool): whether to average the gradients of each mini-batch, the default value is `True`
+
+        Examples:
+          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.gradient_merge = True
@@ -826,6 +848,7 @@ class DistributedStrategy(object):

        Examples:
          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.lars = True
@@ -882,6 +905,7 @@ class DistributedStrategy(object):

        Examples:
          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.lamb = True

--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -232,6 +232,8 @@ class PaddleCloudRoleMaker(RoleMakerBase):
        self._node_type_comm = None
        self._all_comm = None

+        self._non_distributed = False
+
        if not self._is_collective:
            self._hdfs_name = kwargs.get("hdfs_name", "")
            self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
@@ -373,6 +375,15 @@ class PaddleCloudRoleMaker(RoleMakerBase):
            self.generate_role()
        return self._server_endpoints

+    def _is_non_distributed(self):
+        """
+        Return True if indispensable environment for fleetrun is not found
+        (use python-run to launch fleet-code directly)
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._non_distributed
+
    def _heter_worker_num(self):
        """
        get heter worker nums
@@ -409,13 +420,22 @@ class PaddleCloudRoleMaker(RoleMakerBase):
        try:
            # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
            # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
-            self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST",
-                                               "").split(",")
-            assert self._server_endpoints != ""
+            self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                               "").split(",")
-            assert self._server_endpoints != ""
-
+            if self._server_endpoints is None:
+                # back to non_distributed execution.
+                self._server_endpoints = ""
+                self._trainers_num = 1
+                self._role = Role.WORKER
+                self._current_id = 0
+                self._node_num = 1
+                self._heter_trainers_num = 0
+                self._heter_trainer_endpoints = None
+                self._non_distributed = True
+                return
+
+            self._server_endpoints = self._server_endpoints.split(",")
            trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
            training_role = os.environ["TRAINING_ROLE"]

@@ -488,7 +508,11 @@ class PaddleCloudRoleMaker(RoleMakerBase):
        assert (self._training_role == "TRAINER")
        self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-        assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
+        if self._worker_endpoints is None:
+            # back to non_distributed execution.
+            self._worker_endpoints = "127.0.0.1:6170"
+            self._cur_endpoint = self._worker_endpoints
+            self._non_distributed = True
        self._worker_endpoints = self._worker_endpoints.split(",")
        self._trainers_num = len(self._worker_endpoints)
        self._node_num = len(

--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -200,11 +200,11 @@ def launch_collective(args):
        start_port = os.environ.get('FLAGS_START_PORT')
    if cloud_utils.use_paddlecloud() and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port)
-        logger.info("get cluster from cloud:{}".format(cluster))
+        logger.debug("get cluster from cloud:{}".format(cluster))
    else:
        # trainers_num = 1 or not use paddlecloud ips="a,b"
        cluster, pod = get_cluster_from_args(args, gpus)
-        logger.info("get cluster from args:{}".format(cluster))
+        logger.debug("get cluster from args:{}".format(cluster))

    procs = start_local_trainers(
        cluster,
@@ -217,7 +217,8 @@ def launch_collective(args):
        alive = watch_local_trainers(procs, cluster.trainers_nranks())

        if not alive:
-            logger.info("Local procs complete, POD info:{}".format(pod))
+            logger.info("Local processes completed.")
+            logger.debug("POD info:{}".format(pod))
            break

        time.sleep(3)
@@ -313,18 +314,26 @@ def launch_ps(args):
    cmds = []
    log_fns = []
    for idx, cur_server in enumerate(pod.servers):
-        current_env.update({
+        proc_env = {
            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
            "PADDLE_PORT": cur_server.endpoint.split(":")[1],
            "TRAINING_ROLE": "PSERVER",
            "PADDLE_TRAINERS_NUM": str(worker_num),
            "POD_IP": cur_server.endpoint.split(":")[0]
-        })
+        }
+        current_env.update(proc_env)

        cmd = [sys.executable, "-u", args.training_script
               ] + args.training_script_args
        cmds.append(cmd)

+        if idx == 0:
+            logger.info(
+                "Local server start {} processes. First process distributed "
+                "environment info (Only For Debug): {}".format(
+                    len(pod.servers),
+                    pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
+
        if args.log_dir is not None:
            os.system("mkdir -p {}".format(args.log_dir))
            fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w")
@@ -338,21 +347,32 @@ def launch_ps(args):
        tp.rank = cur_server.rank
        tp.local_rank = idx
        tp.log_fn = fn
-        tp.log_offset = 0 if fn else None
+        tp.log_offset = fn.tell() if fn else None
        tp.cmd = cmd

        procs.append(tp)

    for idx, cur_worker in enumerate(pod.workers):
-        current_env.update({
+        proc_env = {
            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
+            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
            "PADDLE_TRAINERS_NUM": str(worker_num),
            "TRAINING_ROLE": "TRAINER",
            "PADDLE_TRAINER_ID": str(cur_worker.rank)
-        })
+        }
+        current_env.update(proc_env)
+
        cmd = [sys.executable, "-u", args.training_script
               ] + args.training_script_args
        cmds.append(cmd)
+
+        if idx == 0:
+            logger.info(
+                "Local worker start {} processes. First process distributed "
+                "environment info (Only For Debug): {}".format(
+                    len(pod.workers),
+                    pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
+
        if args.log_dir is not None:
            os.system("mkdir -p {}".format(args.log_dir))
            fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
@@ -366,11 +386,14 @@ def launch_ps(args):
        tp.rank = cur_worker.rank
        tp.local_rank = idx
        tp.log_fn = fn
-        tp.log_offset = 0 if fn else None
+        tp.log_offset = fn.tell() if fn else None
        tp.cmd = cmd

        procs.append(tp)

+    logger.info(
+        "Please check servers and workers logs in {}/workerlog.* and {}/serverlog.*".
+        format(args.log_dir, args.log_dir))
    # only wait worker to finish here
    for i, proc in enumerate(procs):
        if i < len(pod.servers):
@@ -403,16 +426,16 @@ def launch():
    cuda_device_num = fluid.core.get_cuda_device_count()
    if len(has_ps_args) > 0 or cuda_device_num == 0:
        logger.info(
-            "Run parameter-sever cpu mode. pserver args:{}, cuda count:{}".
+            "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
            format(has_ps_args, cuda_device_num))
        launch_ps(args)
    elif len(has_collective_args) > 0:
-        logger.info("Run collective gpu mode. gpu args:{}, cuda count:{}".
+        logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
                    format(has_collective_args, cuda_device_num))
        launch_collective(args)
    else:
        logger.warning(
-            "Not found distinct args. Default use gpu collective mode")
+            "Not found distinct arguments. Default use gpu collective mode")
        launch_collective(args)



--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -253,7 +253,8 @@ def terminate_local_procs(procs):
    for p in procs:
        if p.proc.poll() is None:
            p.proc.terminate()
-            p.log_fn.close()
+            if p.log_fn:
+                p.log_fn.close()
            logger.debug("terminate process id:{}".format(p.proc.pid))

    #wait all process terminiated
@@ -338,6 +339,45 @@ def get_ports(num, offset):
    return ports


+def pretty_print_envs(envs, header=None):
+    spacing = 2
+    max_k = 40
+    max_v = 45
+
+    for k, v in envs.items():
+        max_k = max(max_k, len(k))
+
+    h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v)
+    l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v)
+    length = max_k + max_v + spacing
+
+    border = "".join(["="] * length)
+    line = "".join(["-"] * length)
+
+    draws = ""
+    draws += border + "\n"
+
+    if header:
+        draws += h_format.format(header[0], header[1])
+    else:
+        draws += h_format.format("fleetrun Distributed Envs", "Value")
+
+    draws += line + "\n"
+
+    for k, v in envs.items():
+        if isinstance(v, str) and len(v) >= max_v:
+            str_v = "... " + v[-41:]
+        else:
+            str_v = v
+
+        draws += l_format.format(k, " " * spacing, str(str_v))
+
+    draws += border
+
+    _str = "\n{}\n".format(draws)
+    return _str
+
+
 class TrainerProc(object):
    def __init__(self):
        self.proc = None
@@ -373,11 +413,19 @@ def start_local_trainers(cluster,

        current_env.update(proc_env)

-        logger.debug("trainer proc env:{}".format(current_env))
-
        cmd = [sys.executable, "-u", training_script] + training_script_args

-        logger.info("start trainer proc:{} env:{}".format(cmd, proc_env))
+        logger.debug("start trainer proc{}  env:{}".format(cmd, current_env))
+
+        if idx == 0:
+            logger.info("Local start {} processes. First process distributed "
+                        "environment info (Only For Debug): {}".format(
+                            len(pod.trainers),
+                            pretty_print_envs(proc_env, ("Distributed Envs",
+                                                         "Value"))))
+            logger.info(
+                "More details for debug about commands and environments are written in {}/run.sh".
+                format(log_dir))

        fn = None
        if log_dir is not None:

--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
--- a/python/paddle/distribution.py
+++ b/python/paddle/distribution.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -270,7 +270,7 @@ foreach(src ${TEST_OPS})
 endforeach()

 # setting timeout value for old unittests
-if(NOT WIN32)
+if(NOT WIN32 AND NOT APPLE)
    set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY")
-	  set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY")
+	set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY")
 endif()
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
--- a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py
--- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
--- a/python/paddle/fluid/input.py
+++ b/python/paddle/fluid/input.py
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/layers/utils.py
+++ b/python/paddle/fluid/layers/utils.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/tests/demo/executor_train_dataset.py
+++ b/python/paddle/fluid/tests/demo/executor_train_dataset.py
--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/fluid/tests/demo/fc_gan.py
--- a/python/paddle/fluid/tests/demo/pipeline_train.py
+++ b/python/paddle/fluid/tests/demo/pipeline_train.py
--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
+++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
--- a/python/paddle/fluid/tests/unittests/launch_function_helper.py
+++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py
--- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
--- a/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py
+++ b/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
--- a/python/paddle/fluid/tests/unittests/test_adamax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
--- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
--- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
--- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
--- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
--- a/python/paddle/fluid/tests/unittests/test_directory_migration.py
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_distribution.py
+++ b/python/paddle/fluid/tests/unittests/test_distribution.py
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
--- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
--- a/python/paddle/fluid/tests/unittests/test_linspace.py
+++ b/python/paddle/fluid/tests/unittests/test_linspace.py
--- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
+++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py
--- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
--- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
--- a/python/paddle/fluid/tests/unittests/test_normal.py
+++ b/python/paddle/fluid/tests/unittests/test_normal.py
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
--- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
--- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
--- a/python/paddle/fluid/tests/unittests/test_rand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rand_op.py
--- a/python/paddle/fluid/tests/unittests/test_randint_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randint_op.py
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
--- a/python/paddle/fluid/tests/unittests/test_stack_op.py
+++ b/python/paddle/fluid/tests/unittests/test_stack_op.py
--- a/python/paddle/fluid/tests/unittests/test_translated_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
--- a/python/paddle/framework/random.py
+++ b/python/paddle/framework/random.py
--- a/python/paddle/hapi/__init__.py
+++ b/python/paddle/hapi/__init__.py
--- a/python/paddle/hapi/dygraph_layer_patch.py
+++ b/python/paddle/hapi/dygraph_layer_patch.py
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
--- a/python/paddle/jit/__init__.py
+++ b/python/paddle/jit/__init__.py
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
--- a/python/paddle/nn/functional/input.py
+++ b/python/paddle/nn/functional/input.py
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
--- a/python/paddle/optimizer/lr_scheduler.py
+++ b/python/paddle/optimizer/lr_scheduler.py
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
--- a/python/paddle/optimizer/sgd.py
+++ b/python/paddle/optimizer/sgd.py
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
--- a/python/paddle/utils/plot.py
+++ b/python/paddle/utils/plot.py
--- a/python/requirements.txt
+++ b/python/requirements.txt
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
--- a/tools/get_cpu_info.sh
+++ b/tools/get_cpu_info.sh