diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 9d5c0cc7048f7db539c090d28c6184ac6d72d75a..bb5e2e1369a8478b500572106f9d11dff12e0189 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -272,7 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer) cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) -cc_library(generator SRCS generator.cc) +cc_library(generator SRCS generator.cc DEPS enforce place) # Get the current working branch execute_process( diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 3cea7a66d01051824a1de01d62c237636771804b..f757e244e38ec965d62d673e63ed082ca70c63c7 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -116,6 +116,8 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s32: return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::bf16: + return platform::to_void_cast(tensor.data()); default: PADDLE_THROW( platform::errors::InvalidArgument("Wrong mkldnn type provided.")); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 6eb84ef9d7c01b589cc95a78ea9727a81f6dc36e..b92c47c2eb018603e1b3156921fb2c1702864c57 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -61,7 +61,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { {DataTypeTrait::DataType(), MKLDNNDataType::f32}, {DataTypeTrait::DataType(), MKLDNNDataType::s8}, {DataTypeTrait::DataType(), MKLDNNDataType::u8}, - {DataTypeTrait::DataType(), MKLDNNDataType::s32}}; + {DataTypeTrait::DataType(), MKLDNNDataType::s32}, + {DataTypeTrait::DataType(), MKLDNNDataType::bf16}}; auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; return MKLDNNDataType::undef; @@ -74,6 +75,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, const Tensor& in, Tensor* out); + +void* GetDataFromTensor(const Tensor& tensor, MKLDNNDataType type); + #endif std::vector GetAxis(const DataLayout& from, const DataLayout& to); diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc index a0d08826b854fea9256382f0e065fd59dda8c8b3..8dfad23db65178c46140b887811846e413bebd00 100644 --- a/paddle/fluid/framework/data_layout_transform_test.cc +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -43,3 +43,17 @@ TEST(DataTransform, DataLayoutFunction) { EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC); EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2})); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(DataTransform, GetDataFromTensorDNNL) { + auto place = paddle::platform::CPUPlace(); + paddle::framework::Tensor in = paddle::framework::Tensor(); + in.mutable_data( + paddle::framework::make_ddim({2, 3, 1, 2}), place); + + void* in_data = + paddle::framework::GetDataFromTensor(in, dnnl::memory::data_type::bf16); + EXPECT_EQ(in_data, 
paddle::platform::to_void_cast( + in.data())); +} +#endif diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index df58193f95e2fc2f1ff7e4b7af76dd1f7c9837ef..94934629e28726d15348c5c692eaf31f7598110c 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -95,9 +95,10 @@ void DatasetImpl::SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi) { fs_name_ = fs_name; fs_ugi_ = fs_ugi; - std::string cmd = std::string("hadoop fs"); + std::string cmd = std::string("$HADOOP_HOME/bin/hadoop fs"); cmd += " -D fs.default.name=" + fs_name; cmd += " -D hadoop.job.ugi=" + fs_ugi; + cmd += " -Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=500000"; paddle::framework::hdfs_set_command(cmd); } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index f479d92483c1c39a0b43e0d8c514237bf89bcc00..8188d5cde1b90436d040e8b9dcc1070ac85bf319 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -18,6 +18,7 @@ #include using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 2c4a7b4d02727437742b19cc6d51e209e4346d03..720e422e114835f367317d4ba265254856885c15 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -17,6 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -36,15 +38,16 @@ struct DataTypeTrait { #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ _ForEachDataTypeHelper_(callback, int8_t, INT8) #define _ForEachDataTypeSmall_(callback) \ diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 2a380201f297f42dd82a6809bef9a72660066819..331596da33acc151810cd616ea6d5bdcae333b30 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -38,3 +38,25 @@ TEST(DataType, float16) { std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } + +TEST(DataType, bfloat16) { + using paddle::framework::Tensor; + 
using paddle::platform::CPUPlace; + using paddle::platform::bfloat16; + namespace f = paddle::framework; + f::proto::VarType::Type dtype = f::proto::VarType::BF16; + + Tensor tensor; + CPUPlace cpu; + tensor.mutable_data(cpu, dtype); + + // test bf16 tensor + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(bfloat16))); + + // test bf16 size + EXPECT_EQ(f::SizeOfType(dtype), 2u); + + // test debug info + std::string type = "::paddle::platform::bfloat16"; + EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); +} diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 44542f05d9d5c92f58a84dc2be59782bae2ff3aa..3d56152c237695126d2eecb0c51ebd964a85a690 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -77,6 +77,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var, framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; + case proto::VarType::BF16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; case proto::VarType::FP32: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index bbebea9f13fd37469a0e9b7be9719aca128f5687..ea7a665bcbe02ff382f1b3bf04ce177a674483c9 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -24,6 +24,11 @@ TEST(DataTypeTransform, CPUTransform) { paddle::framework::DataLayout::kAnyLayout, paddle::framework::LibraryType::kPlain); + auto kernel_bf16 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::BF16, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + auto kernel_fp32 = paddle::framework::OpKernelType( paddle::framework::proto::VarType::FP32, place, paddle::framework::DataLayout::kAnyLayout, @@ -189,4 +194,120 @@ TEST(DataTypeTransform, CPUTransform) { static_cast(in_data_bool[i]).x); } } + + // data type transform from/to bfloat16 + { + paddle::framework::Tensor in; + paddle::framework::Tensor out; + + paddle::platform::bfloat16* ptr = + in.mutable_data( + paddle::framework::make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from bfloat16 to other data types + paddle::framework::TransDataType(kernel_bf16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_int64, in, &out); + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_bool, in, &out); + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to bfloat16 + float* 
in_data_float = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + paddle::framework::TransDataType(kernel_fp32, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_float[i]).x); + } + + // transform double to bfloat16 + double* in_data_double = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + paddle::framework::TransDataType(kernel_fp64, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_double[i]).x); + } + + // transform int to bfloat16 + int* in_data_int = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + paddle::framework::TransDataType(kernel_int32, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int[i]).x); + } + + // transform int64 to bfloat16 + int64_t* in_data_int64 = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + paddle::framework::TransDataType(kernel_int64, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int64[i]).x); + } + + // transform bool to bfloat16 + bool* in_data_bool = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + paddle::framework::TransDataType(kernel_bool, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_bool[i]).x); + } + } } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4d8bd101258664f6cafd71784ae070e0cb8b9215..a3cc4d1721e20a72817606bd773129230a8154ce 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -3,6 +3,7 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library(fetch_async_op_handle SRCS fetch_async_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) @@ -98,7 +99,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc - DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) + DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool device_context) cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) cc_test(exception_holder_test SRCS 
exception_holder_test.cc ) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index f5ec78f44b5ebb780cc569c24ccdca6336195961..e440dff2af6b5649d34f47c3b696edeb8a1ba0a2 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -18,7 +18,8 @@ #include #include #include -#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/fetch_async_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -120,6 +121,11 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( } // Wait FetchOps. ClearFetchOp(graph_, &fetch_ops); + + for (auto &place : places_) { + fetch_ctxs_.Get(place)->Wait(); + } + return fetches; } @@ -162,8 +168,8 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps( ir::Node *fetch_node = graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_, - &local_exec_scopes_, return_merged); + auto *op = new FetchAsyncOpHandle(fetch_node, fetches, i, &local_scopes_, + &local_exec_scopes_, return_merged); fetch_ops->emplace_back(op); for (auto &p : places_) { @@ -174,6 +180,14 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps( op->AddInput(var); } + for (auto *var : vars) { + auto *op = var->GeneratedOp(); + auto *compute_op = dynamic_cast(op); + if (compute_op) { + compute_op->SetLockAndRecordEventFree(false); + } + } + int dep = static_cast(op->NotReadyInputSize()); (*op_deps)[op] = dep; if (dep == 0) { @@ -261,7 +275,7 @@ void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { - if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { + if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { traced_ops_.emplace_back(op); } } diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..6aae523365ed50e78a78b318ac0990490c801eb3 --- /dev/null +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/details/fetch_async_op_handle.h" +#include +#include +#include +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { + +FetchAsyncOpHandle::FetchAsyncOpHandle(ir::Node *node, FetchResultType *data, + size_t offset, + std::vector *local_scopes, + std::vector *local_exec_scopes, + bool return_merged) + : OpHandleBase(node), + data_(data), + offset_(offset), + local_scopes_(local_scopes), + local_exec_scopes_(local_exec_scopes), + return_merged_(return_merged) {} + +FetchAsyncOpHandle::~FetchAsyncOpHandle() {} + +void FetchAsyncOpHandle::RecordWaitEventOnCtx( + platform::DeviceContext *waited_ctx) { + PADDLE_THROW(platform::errors::PermissionDenied( + "No nodes need to wait FetchAsyncOp. Unexpceted Error.")); +} + +static void CheckTensorAttrs(const LoDTensor *tensor, + const proto::VarType::Type &type, + const DataLayout &layout, const DDim &dims, + const LoD &lod, const size_t offset) { + if (tensor->numel() && tensor->IsInitialized()) { + // step1: check type + PADDLE_ENFORCE_EQ( + type, tensor->type(), + platform::errors::InvalidArgument( + "The data type of fetched Tensors or the items of fetched " + "LoDTensorArray are different from each other on different " + "devices(%s vs %s). And the error is caused by the %zu " + "(th) fetched variable. Please set the " + "parameter `return_merged = False` when you " + "call the `Executor.run()` method.", + DataTypeToString(type), DataTypeToString(tensor->type()), offset)); + + // step2: check layout + PADDLE_ENFORCE_EQ( + layout, tensor->layout(), + platform::errors::InvalidArgument( + "The layout of fetched Tensors or the items of fetched " + "LoDTensorArray are different from each other on different " + "devices(%s vs %s). And the error is caused by the %zu " + "(th) fetched variable. Please set the " + "parameter `return_merged = False` when you " + "call the `Executor.run()` method.", + DataLayoutToString(layout), DataLayoutToString(tensor->layout()), + offset)); + } + + // step3: check dims + auto tensor_dims = tensor->dims(); + PADDLE_ENFORCE_EQ(dims.size(), tensor_dims.size(), + platform::errors::InvalidArgument( + "The dimension sizes of fetched Tensors or " + "the items of fetched LoDTensorArray are " + "different from each other on different " + "devices(%s vs %s). And the error is caused by the %zu " + "(th) fetched variable. Please set the " + "parameter `return_merged = False` when you " + "call the `Executor.run()` method.", + dims, tensor_dims, offset)); + for (int j = 1; j < dims.size(); j++) { + PADDLE_ENFORCE_EQ(dims[j], tensor_dims[j], + platform::errors::InvalidArgument( + "The dimensions of fetched Tensors or " + "the items of fetched LoDTensorArray are " + "different from each other on different " + "devices(%s vs %s). And the error is caused by the " + "%zu (th) fetched variable. Please set the " + "parameter `return_merged = False` when " + "you call the `Executor.run()` method.", + dims, tensor_dims, offset)); + } + + // step4: check lod + PADDLE_ENFORCE_EQ( + lod.size(), tensor->lod().size(), + platform::errors::InvalidArgument( + "The LoD information of fetched Tensors or the items of fetched " + "LoDTensorArray are different from each other on different " + "devices(%s vs %s). And the error is caused by the %zu " + "(th) fetched variable. 
Please set the " + "parameter `return_merged = False` when you " + "call the `Executor.run()` method.", + lod, tensor->lod(), offset)); +} + +static void TransData(const framework::Tensor *src_item, + framework::Tensor *dst_item, + const platform::DeviceContext &ctx) { + if (src_item->IsInitialized() && src_item->numel() > 0) { + if (platform::is_gpu_place(src_item->place())) { +#ifdef PADDLE_WITH_CUDA + TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); +#endif + } else { + TensorCopy(*src_item, platform::CPUPlace(), dst_item); + } + } +} + +void FetchAsyncOpHandle::FetchMergedLodTensor( + const std::vector &src_lodtensors, + LoDTensor *dst_lodtensor) { + // calc dst type,layout,dim,lod and calc check dim + proto::VarType::Type new_type = proto::VarType::FP32; + framework::DataLayout new_layout; + framework::DDim new_dim; + LoD new_lod = src_lodtensors[0]->lod(); + + framework::DDim check_dim; + + for (auto *t : src_lodtensors) { + if (t->numel() && t->IsInitialized()) { + check_dim = t->dims(); + new_type = t->type(); + new_layout = t->layout(); + break; + } + } + + bool find_first_dims = false; + for (auto *t : src_lodtensors) { + if (t->numel() && t->IsInitialized()) { + if (!find_first_dims) { + new_dim = t->dims(); + find_first_dims = true; + } else { + new_dim[0] += t->dims()[0]; + } + } + } + + // check src type,layout,dim,lod consistence + for (size_t i = 1; i < src_lodtensors.size(); ++i) { + CheckTensorAttrs(src_lodtensors[i], new_type, new_layout, check_dim, + new_lod, offset_); + } + + // set dst tensor + dst_lodtensor->Resize(new_dim); + dst_lodtensor->set_layout(src_lodtensors[0]->layout()); + dst_lodtensor->set_lod(src_lodtensors[0]->lod()); + if (platform::is_gpu_place(src_lodtensors[0]->place())) { + dst_lodtensor->mutable_data(platform::CUDAPinnedPlace(), + src_lodtensors[0]->type()); + } else { + dst_lodtensor->mutable_data(platform::CPUPlace(), + src_lodtensors[0]->type()); + } + + // slice and memcpy + int begin = 0; + for (auto *src : src_lodtensors) { + int end = begin + src->dims()[0]; + if (end == begin) { + continue; + } + auto dst = dst_lodtensor->Slice(begin, end); + TransData(src, &dst, *dev_ctxes_[src->place()]); + begin = end; + } +} + +void FetchAsyncOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + WaitInputVarGenerated(); + + // get src vars + auto &scopes = *local_exec_scopes_; + std::vector src_vars; + src_vars.reserve(inputs_.size()); + for (size_t i = 0; i < inputs_.size(); ++i) { + auto *var_handle = static_cast(inputs_[i]); + auto &scope = scopes.at(var_handle->scope_idx()); + auto *var = scope->FindVar(var_handle->name()); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "Cannot find variable %s in execution scope.", var_handle->name())); + src_vars.emplace_back(var); + } + + if (return_merged_) { + auto &val = BOOST_GET(FetchList, *data_); + if (src_vars[0]->IsType()) { + // to lodtensor type + std::vector src_lodtensors; + src_lodtensors.reserve(src_vars.size()); + for (size_t i = 0; i < src_vars.size(); ++i) { + src_lodtensors.emplace_back(&src_vars[i]->Get()); + } + + LoDTensor dst_lodtensor; + FetchMergedLodTensor(src_lodtensors, &dst_lodtensor); + val.at(offset_) = std::move(dst_lodtensor); + } else { + // to lodtensorarray type + std::vector src_lodtensor_arrays; + src_lodtensor_arrays.reserve(src_vars.size()); + for (size_t i = 0; i < src_vars.size(); ++i) { + src_lodtensor_arrays.emplace_back( + &src_vars[i]->Get()); + } + + LoDTensorArray dst_lodtensor_array; + 
dst_lodtensor_array.resize(src_lodtensor_arrays[0]->size()); + + for (size_t i = 0; i < dst_lodtensor_array.size(); ++i) { + std::vector src_lodtensors; + src_lodtensors.reserve(src_lodtensor_arrays.size()); + for (size_t j = 0; j < src_lodtensor_arrays.size(); ++j) { + src_lodtensors.emplace_back(&(*src_lodtensor_arrays[j])[i]); + } + FetchMergedLodTensor(src_lodtensors, &dst_lodtensor_array[i]); + } + val.at(offset_) = std::move(dst_lodtensor_array); + } + } else { + auto &val = BOOST_GET(FetchUnmergedList, *data_); + auto &dst_tensors = val.at(offset_); + dst_tensors.reserve(src_vars.size()); + + for (size_t i = 0; i < src_vars.size(); ++i) { + if (src_vars[i]->IsType()) { + auto &t = src_vars[i]->Get(); + LoDTensor item; + TransData(&t, &item, *dev_ctxes_[t.place()]); + dst_tensors.emplace_back(std::move(item)); + } else { + auto &t = src_vars[i]->Get(); + LoDTensorArray item; + item.resize(t.size()); + for (size_t j = 0; j < t.size(); ++j) { + TransData(&t[j], &item[j], *dev_ctxes_[t[j].place()]); + } + dst_tensors.emplace_back(std::move(item)); + } + } + } +} + +bool FetchAsyncOpHandle::IsMultiDeviceTransfer() { return true; } + +std::string FetchAsyncOpHandle::Name() const { return "FetchAsync"; } + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..691a3286c270badad938610811cc6e73d63c2c04 --- /dev/null +++ b/paddle/fluid/framework/details/fetch_async_op_handle.h @@ -0,0 +1,63 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct FetchAsyncOpHandle : public OpHandleBase { + public: + FetchAsyncOpHandle(ir::Node *node, FetchResultType *data, size_t offset, + std::vector *local_scopes, + std::vector *local_exec_scopes, + bool return_merged); + + ~FetchAsyncOpHandle(); + + void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override; + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return *local_scopes_; } + + void FetchMergedLodTensor( + const std::vector &src_lodtensors, + LoDTensor *dst_lodtensor); + + private: + FetchResultType *data_; + size_t offset_; + std::vector *local_scopes_; + std::vector *local_exec_scopes_; + bool return_merged_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 5574a55e18c6d9806cb878dc69ec597f81da97d8..ae69960ef78c3e35143c66226133bd0dceac8b79 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -36,7 +36,8 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FetchResultType *data, FetchOpHandle::~FetchOpHandle() {} void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { - PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); + PADDLE_THROW(platform::errors::PermissionDenied( + "No nodes need to wait FetchOp. 
Unexpceted Error.")); } static void CheckDims(const framework::DDim &tensor_dims, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 956b099e883f9ea6d96db8716cb0fa693a3796d4..0ad84f5890acaf1c793000859ed3fbc7c1fc22d3 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -167,6 +167,8 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, // more detail see: 180 page of // https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) +#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ + omp_in) #endif template @@ -205,6 +207,21 @@ void CheckNanInf( PrintNanInf(value, numel, print_num, op_type, var_name); } } + +template <> +void CheckNanInf( + const paddle::platform::bfloat16* value, const size_t numel, int print_num, + const std::string& op_type, const std::string& var_name) { + float sum = 0.0f; +#pragma omp parallel for reduction(+ : sum) + for (size_t i = 0; i < numel; ++i) { + sum += static_cast(value[i] - value[i]); + } + + if (std::isnan(sum) || std::isinf(sum)) { + PrintNanInf(value, numel, print_num, op_type, var_name); + } +} #endif template <> diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc index 4f1e44ca26cb65468da6eded74653f34dbf00336..71123f708e3ca149d9fd634f55652cede5a57b50 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/details/fetch_async_op_handle.h" namespace paddle { namespace framework { @@ -23,9 +24,11 @@ void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops) { if (fetch_ops->empty()) return; for (auto& op : *fetch_ops) { - PADDLE_ENFORCE_NOT_NULL( - dynamic_cast(op), - "The input ops of ClearFetchOp function should be FetchOpHandle."); + PADDLE_ENFORCE_EQ(dynamic_cast(op) != nullptr || + dynamic_cast(op) != nullptr, + true, + "The input ops of ClearFetchOp function should be " + "FetchOpHandle or FetchAsyncOpHandle."); for (auto& out_var : op->Node()->outputs) { graph->RemoveNode(out_var); } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 180b33d0cb72e2c4c9e6e8caff9f0ef5f1b04689..915589b3242b7d5675e630aca7310185fd109ec2 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -23,6 +23,7 @@ template static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; if (std::is_same::value || + std::is_same::value || std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index cdf210d661c73e69e125c0ebfa85cc852360e352..34fff042770c5f50a280408d8f7f925488b3879c 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -857,7 +857,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( float* g = g_tensor->data(); if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { - int dim = emb_dim + offset; + int dim = emb_dim; Eigen::Map< Eigen::Matrix> g_mat(g, g_tensor->numel() / dim, dim); diff --git 
a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 9bde9e20b19a0b14ce4489b91d9ab3d5273f7f9a..d51e97d98e902a87cd2a44d2019e93e8dfc30fc8 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -21,10 +21,46 @@ limitations under the License. */ #include #include #include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { +const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { +#ifdef PADDLE_WITH_CUDA + + static int64_t num_cuda_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque cuda_device_flags; + static std::vector> default_cuda_generators; + + std::call_once(num_devices_init_flag, []() { + num_cuda_devices = paddle::platform::GetCUDADeviceCount(); + cuda_device_flags.resize(num_cuda_devices); + default_cuda_generators.resize(num_cuda_devices); + }); + if (device_id < 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuda device id shoule be greater than 0")); + } + + std::call_once(cuda_device_flags[device_id], [device_id]() { + default_cuda_generators[device_id] = + std::make_shared(GetRandomSeed(), device_id); + VLOG(4) << "initial seed: " + << default_cuda_generators[device_id]->GetCurrentSeed(); + }); + return default_cuda_generators[device_id]; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "getDefaultCUDAGenerator only support in CUDA place")); +#endif +} + const std::shared_ptr& DefaultCPUGenerator() { static auto default_cpu_generator = std::make_shared(GetRandomSeed()); @@ -103,6 +139,7 @@ uint64_t Generator::Seed() { void Generator::SetCurrentSeed(uint64_t seed) { std::lock_guard lock(this->mu_); this->state_.current_seed = seed; + this->state_.thread_offset = 0; std::seed_seq seq({seed}); this->engine_->seed(seq); } @@ -123,6 +160,22 @@ uint64_t Generator::Random64() { return (*engine)(); } +std::pair Generator::IncrementOffset( + uint64_t increament_offset) { + uint64_t cur_offset = this->state_.thread_offset; +#ifdef PADDLE_WITH_CUDA + std::lock_guard lock(this->mu_); + + this->state_.thread_offset += increament_offset; + +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Increment Offset only support in CUDA place")); +#endif + return std::make_pair(static_cast(this->state_.current_seed), + cur_offset); +} + void Generator::SetIsInitPy(bool is_init_py) { this->is_init_py_ = is_init_py; VLOG(4) << "SetIsInitPy:" << this->is_init_py_; diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 82b35f7ad550e770e8d10457ddf6cdf8e6fbd709..a279c2e4e1458293b6579b7b7cb2111e440e5d5e 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -38,6 +38,7 @@ static uint64_t GetRandomSeed() { struct GeneratorState { int64_t device = -1; uint64_t current_seed = 34342423252; + uint64_t thread_offset = 0; std::mt19937_64 cpu_engine; }; @@ -49,6 +50,7 @@ struct Generator { this->state_.cpu_engine = *engine; this->state_.device = -1; this->state_.current_seed = seed; + this->state_.thread_offset = 0; this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; @@ -59,11 +61,25 @@ struct Generator { this->state_.cpu_engine = *engine; this->state_.device = -1; this->state_.current_seed = seed; + this->state_.thread_offset = 0; this->engine_ = engine; VLOG(4) << "initial seed: " << 
this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; this->is_init_py_ = true; // TODO(zhiqiu): remove it in future } + Generator(uint64_t seed, uint64_t device_id) { + std::seed_seq seq({seed}); + auto engine = std::make_shared(seq); + this->state_.cpu_engine = *engine; + this->state_.device = device_id; + this->state_.current_seed = seed; + this->state_.thread_offset = 0; + this->engine_ = engine; + VLOG(4) << "initial seed: " << this->state_.current_seed + << ", cpu engine: " << &this->state_.cpu_engine; + this->is_init_py_ = false; // TODO(zhiqiu): remove it in future + } + Generator(const Generator& other) = delete; // get random state @@ -83,8 +99,11 @@ struct Generator { uint64_t Random64(); + std::pair IncrementOffset(uint64_t increament_offset); + void SetIsInitPy(bool); bool GetIsInitPy() const; + uint64_t get_device_id() { return this->state_.device; } private: GeneratorState state_; @@ -105,5 +124,8 @@ std::shared_ptr OpDefaultCPUEngine(); std::shared_ptr GetCPURandomEngine(uint64_t); +const std::shared_ptr& GetDefaultCUDAGenerator( + int64_t device_id = -1); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 79b15fc87d0b0a0ade8324710b80af634ff8878f..5edd70e035f98f408c0104297e084771cd158f53 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -133,6 +133,9 @@ class OpVersion { checkpoints_.push_back(Checkpoint({note, op_version_desc})); return *this; } + uint32_t GetVersionID() const { + return static_cast(checkpoints_.size()); + } private: struct Checkpoint { @@ -156,6 +159,14 @@ class OpVersionRegistrar { op_version_map_.insert({op_type, OpVersion()}); return op_version_map_[op_type]; } + uint32_t GetVersionID(const std::string& op_type) const { + auto it = op_version_map_.find(op_type); + if (it == op_version_map_.end()) { + return 0; + } + + return it->second.GetVersionID(); + } private: std::unordered_map op_version_map_; @@ -164,6 +175,125 @@ class OpVersionRegistrar { OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; }; +class OpVersionComparator { + public: + virtual bool operator()() = 0; + virtual ~OpVersionComparator() = default; +}; + +#define ADD_OP_VERSION_COMPARATOR(cmp_name, cmp_math) \ + class OpVersion##cmp_name##Comparator : public OpVersionComparator { \ + public: \ + explicit OpVersion##cmp_name##Comparator(const std::string op_name, \ + uint32_t target_version) \ + : op_name_(op_name), target_version_(target_version) {} \ + virtual bool operator()() { \ + return OpVersionRegistrar::GetInstance().GetVersionID(op_name_) \ + cmp_math target_version_; \ + } \ + virtual ~OpVersion##cmp_name##Comparator() {} \ + \ + private: \ + std::string op_name_; \ + uint32_t target_version_; \ + }; + +ADD_OP_VERSION_COMPARATOR(LE, <=); +ADD_OP_VERSION_COMPARATOR(EQ, ==); +ADD_OP_VERSION_COMPARATOR(GE, >=); +ADD_OP_VERSION_COMPARATOR(NE, !=); + +class OpVersionComparatorCombination { + public: + OpVersionComparatorCombination() {} + + OpVersionComparatorCombination& LE(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionLEComparator(op_name, target_version))); + return *this; + } + OpVersionComparatorCombination& EQ(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionEQComparator(op_name, target_version))); + return *this; + } + 
OpVersionComparatorCombination& GE(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionGEComparator(op_name, target_version))); + return *this; + } + OpVersionComparatorCombination& NE(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionNEComparator(op_name, target_version))); + return *this; + } + + bool IsMatched() const { + for (const auto& cmp : op_version_comparators_) { + if (!(*cmp)()) { + return false; + } + } + return true; + } + + private: + std::vector> op_version_comparators_; +}; + +class PassVersionCheckers { + public: + PassVersionCheckers& AddCombination( + const OpVersionComparatorCombination& combinations) { + pass_version_checkers_.push_back(combinations); + return *this; + } + bool IsPassCompatible() const { + if (pass_version_checkers_.empty()) { + return true; + } + for (const auto& checker : pass_version_checkers_) { + if (checker.IsMatched()) { + return true; + } + } + return false; + } + + private: + std::vector pass_version_checkers_; +}; + +class PassVersionCheckerRegistrar { + public: + static PassVersionCheckerRegistrar& GetInstance() { + static PassVersionCheckerRegistrar instance; + return instance; + } + PassVersionCheckers& Register(const std::string& pass_name) { + return pass_version_checkers_map_[pass_name]; + } + bool IsPassCompatible(const std::string& fuse_pass_name) const { + auto iter = pass_version_checkers_map_.find(fuse_pass_name); + if (iter == pass_version_checkers_map_.end()) { + return true; + } + return iter->second.IsPassCompatible(); + } + + private: + std::unordered_map + pass_version_checkers_map_; + + PassVersionCheckerRegistrar() = default; + PassVersionCheckerRegistrar& operator=(const PassVersionCheckerRegistrar&) = + delete; +}; + } // namespace compatible } // namespace framework } // namespace paddle @@ -173,3 +303,9 @@ class OpVersionRegistrar { RegisterOpVersion__##op_type = \ paddle::framework::compatible::OpVersionRegistrar::GetInstance() \ .Register(#op_type) + +#define REGISTER_PASS_CAPABILITY(pass_name) \ + static auto RegisterOpPassVersionChecker__##pass_name = \ + paddle::framework::compatible::PassVersionCheckerRegistrar:: \ + GetInstance() \ + .Register(#pass_name) diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index 80ad51ad07b5a84cfabb3ace9b478b1f6ea24f95..239dbc4357854a8962567129b259a64260308b49 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -55,6 +55,72 @@ TEST(test_operator_version, test_operator_version) { .NewInput("X2", "The second input.") .NewOutput("Y2", "The second output.")); } + +TEST(test_pass_op_version_checker, test_pass_op_version_checker) { + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "no_bind_pass")); + + REGISTER_PASS_CAPABILITY(test_pass1) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("mul", 1) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass1")); + + REGISTER_PASS_CAPABILITY(test_pass2) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("mul", 0) + .NE("fc", 0)); + ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass2")); + + REGISTER_PASS_CAPABILITY(test_pass3) + .AddCombination( + 
paddle::framework::compatible::OpVersionComparatorCombination() + .GE("mul", 0) + .NE("fc", 0)) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("mul", 1) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass3")); + + REGISTER_PASS_CAPABILITY(test_pass4) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("test__", 5) + .EQ("fc", 0)); + ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass4")); + + REGISTER_PASS_CAPABILITY(test_pass5) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("test__", 4) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass5")); + + REGISTER_PASS_CAPABILITY(test_pass6) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("test__", 4) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass6")); + + REGISTER_PASS_CAPABILITY(test_pass7) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .NE("test__", 4) + .EQ("fc", 0)); + ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass7")); +} + } // namespace compatible } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 6fbf880356c541e72cae6f3b03efe017042254ff..9eb8478515727cf04f9d16e9a38a8f4c3ec9c683 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -90,32 +90,6 @@ void MemoryOptimizePass::CollectLifeCycle( } } -// TODO(Superjomn) Make this a general help method. 
-int DataTypeToSpace(framework::proto::VarType_Type type) { - switch (type) { - case framework::proto::VarType_Type_BOOL: - return sizeof(bool); - case framework::proto::VarType_Type_FP32: - return sizeof(float); - case framework::proto::VarType_Type_INT32: - return sizeof(int32_t); - case framework::proto::VarType_Type_INT64: - return sizeof(int64_t); - case framework::proto::VarType_Type_INT16: - return sizeof(int16_t); - case framework::proto::VarType_Type_FP16: - return sizeof(int16_t); - case framework::proto::VarType_Type_FP64: - return sizeof(double); - case framework::proto::VarType_Type_UINT8: - return sizeof(unsigned char); - case framework::proto::VarType_Type_INT8: - return sizeof(int8_t); - default: - PADDLE_THROW("Unknown data type"); - } -} - void MemoryOptimizePass::CollectVarMemorySize( space_table_t* space_table) const { const int fake_batch_size = 1; @@ -163,7 +137,7 @@ void MemoryOptimizePass::CollectVarMemorySize( int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); (*space_table)[node->Var()->Name()] = - size * DataTypeToSpace(node->Var()->GetDataType()); + size * paddle::framework::SizeOfType(node->Var()->GetDataType()); } } } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 127a41aee890808258367fb40804a9547b8fdbb0..500aa8341d6a61056f6f80f82c6f28bb569eb772 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1058,6 +1058,7 @@ USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm); USE_TRT_CONVERTER(skip_layernorm); USE_TRT_CONVERTER(slice); USE_TRT_CONVERTER(scale); +USE_TRT_CONVERTER(stack); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc index 325c7ab2539f28f5145ee88a1bbf374f333348e1..d29bcb76be78f151dc606d9f335e9df9ed19b16b 100644 --- a/paddle/fluid/inference/lite/test_engine.cc +++ b/paddle/fluid/inference/lite/test_engine.cc @@ -14,15 +14,16 @@ #include -#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/lite/ut_helper.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/lite/engine.h" +#include "paddle/fluid/operators/lite/ut_helper.h" + namespace paddle { namespace inference { namespace lite { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 8b7371490c09068fd4b84ddb541014204806a2b2..39d02909abd1f1d96f73cc9f3e3ea9d26a1f5c72 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,8 +3,8 @@ nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc -emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc + shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc + emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc DEPS tensorrt_engine 
tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index 19e1895635aa7670a0ca453656c3407d132e8db4..f9a1fe41ddc046aad8cc3a5397453b0f68c1a112 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -58,6 +58,24 @@ class ScaleOpConverter : public OpConverter { TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::ILayer* layer = nullptr; + + auto input_dim = input->getDimensions(); + PADDLE_ENFORCE_GE(input_dim.nbDims, 3, + platform::errors::Fatal( + "Paddle-TRT scale mode only support dimension >= 3")); + + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + if (input_dim.nbDims == 3) { + // TensorRT scale layer is not supporting input dims < 4 when using + // explicit batch + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + nvinfer1::Dims4 target_shape(0, 0, 0, 1); // expand 1 dims + expand_layer->setReshapeDimensions(target_shape); + input = expand_layer->getOutput(0); + } + if (bias_after_scale) { layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, @@ -73,6 +91,18 @@ class ScaleOpConverter : public OpConverter { power_weights.get(), scale_weights.get(), power_weights.get()); } + PADDLE_ENFORCE_EQ(layer != nullptr, true, + platform::errors::Fatal("Create scale layer failed.")); + + if (input_dim.nbDims == 3) { + // TensorRT scale layer is not supporting input dims < 4 when using + // explicit batch + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + nvinfer1::Dims3 target_shape(0, 0, 0); // expand 1 dims + squeeze_layer->setReshapeDimensions(target_shape); + layer = static_cast(squeeze_layer); + } RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f35024529c61a253f314e5eca985713227d3f343 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Stack converter from fluid to tensorRT. 
+ */ +class StackOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid stack op to tensorrt stack layer"; + + framework::OpDesc op_desc(op, nullptr); + auto input = op_desc.Input("X"); + int input_num = input.size(); + nvinfer1::ITensor** inputs = + (nvinfer1::ITensor**)malloc(input_num * sizeof(nvinfer1::ITensor*)); + + for (int i = 0; i < input_num; ++i) { + inputs[i] = engine_->GetITensor(input[i]); + } + + int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); + if (axis < 0) { + axis = axis + inputs[0]->getDimensions().nbDims + 1; + } + + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num); + layer = engine_->AddPluginV2(inputs, input_num, plugin); + assert(layer != nullptr); +#else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "You are running the Ernie(Bert) model in static" + "shape mode, which is not supported for the time being.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) interface" + " to set the shape information to run the dynamic shape mode.")); + } + auto output_name = op_desc.Output("Y").front(); + RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); + free(inputs); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(stack, StackOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index e8cbb9431cb3c79ed6c6269f96c256fd50afb121..a5b71356d0eca43555f4190b8cac2055a3eb679c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -88,6 +88,7 @@ struct SimpleOpTypeSetTeller : public Teller { "gelu", "layer_norm", "scale", + "stack", }; }; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e417fcbb2ce9267ad491996063e5725799815f55..98afdbe254a4b0a086d4a4aa88096a06c40138d1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,7 +1,8 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu - prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu + prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu -instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu -qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu - DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) + instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu + qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu + hard_swish_op_plugin.cu stack_op_plugin.cu + DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 48afcfce347d681fbbb291e478ead1fa28475a22..1fa5b3228e1158fe0423c457d974e0bbf970a30f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -104,32 +104,51 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( auto stri_0 = expr_builder.constant(strides_[0]); auto stri_1 = expr_builder.constant(strides_[1]); + auto one_value = expr_builder.constant(1); - auto tmp1_0 = - expr_builder.constant((-ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1); - auto tmp1_1 = - expr_builder.constant((-ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1); + auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]); + auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]); - auto tmp2_0 = expr_builder.constant( - (-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / strides_[0] + 1); - auto tmp2_1 = expr_builder.constant( - (-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / strides_[1] + 1); - - auto *a_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV, - *inputs[0].d[2], *stri_0); - auto *b_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV, - *inputs[0].d[3], *stri_1); + auto ceil_tmp = + expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1); + auto ceil1_tmp = + expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1); if (!ceil_mode_) { - output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *a_d, *tmp1_0); - output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *b_d, *tmp1_1); + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *v0_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *v1_tmp), + *stri_1), + *one_value); + } else { - output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *a_d, *tmp2_0); - output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *b_d, *tmp2_1); + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *ceil_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *ceil1_tmp), + *stri_1), + *one_value); } return output; diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ecbf4be154f01059ef33e2d510d8329d6726314 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -0,0 +1,247 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +StackPluginDynamic::StackPluginDynamic(int axis, int num_stack) + : axis_(axis), num_stack_(num_stack) {} + +StackPluginDynamic::StackPluginDynamic(void const* serial_data, + size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &num_stack_); +} + +StackPluginDynamic::~StackPluginDynamic() {} + +nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const { + return new StackPluginDynamic(axis_, num_stack_); +} + +const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; } + +int StackPluginDynamic::getNbOutputs() const { return 1; } + +int StackPluginDynamic::initialize() { return 0; } + +size_t StackPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(axis_); + serialize_size += SerializedSize(num_stack_); + return serialize_size; +} + +void StackPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, num_stack_); +} + +nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::DimsExprs output(inputs[0]); + output.nbDims = inputs[0].nbDims + 1; + + for (int i = inputs[0].nbDims; i > axis_; --i) { + output.d[i] = inputs[0].d[i - 1]; + } + output.d[axis_] = expr_builder.constant(nb_inputs); + return output; +} + +void StackPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t StackPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return num_stack_ * sizeof(uintptr_t); +} + +void StackPluginDynamic::destroy() { delete this; } + +void StackPluginDynamic::terminate() {} + +bool StackPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of stack plugin should not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { +#ifdef SUPPORTS_CUDA_FP16 + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#else + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#endif + } + const nvinfer1::PluginTensorDesc& prev = 
in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType StackPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be equal to 0")); + return input_types[0]; +} + +template +__global__ void StackKernel(const T* const* input, T* output, int num_stack, + int base_unit) { + int stack_id = blockIdx.x; + int lead_id = blockIdx.y; + + for (int i = threadIdx.x; i < base_unit; i += blockDim.x) { + output[lead_id * num_stack * base_unit + stack_id * base_unit + i] = + input[stack_id][lead_id * base_unit + i]; + } +} + +int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) { + auto input_dims = input_desc[0].dims; // (batch, seq, seq) + auto out_dims = output_desc[0].dims; // (batch, num_head, seq, seq) + auto out_num_dims = out_dims.nbDims; + + int base_unit = 1; + for (int i = axis_ + 1; i < out_num_dims; ++i) { + PADDLE_ENFORCE_GT(out_dims.d[i], 0, + platform::errors::InvalidArgument( + "Input dimensions should be greater than 0")); + base_unit *= out_dims.d[i]; + } + + int lead_unit = 1; + for (int i = 0; i < axis_; ++i) { + PADDLE_ENFORCE_GT(out_dims.d[i], 0, + platform::errors::InvalidArgument( + "Input dimensions should be greater than 0")); + lead_unit *= out_dims.d[i]; + } + + PADDLE_ENFORCE_EQ( + out_dims.d[axis_], num_stack_, + platform::errors::InvalidArgument("number of stack axis should be same")); + + cudaMemcpyAsync(workspace, reinterpret_cast(inputs), + sizeof(void*) * out_dims.d[axis_], cudaMemcpyHostToDevice, + stream); + + const int num_stacks = out_dims.d[axis_]; + dim3 num_blocks(num_stacks, lead_unit); + const int num_threads = 256; + auto infer_type = input_desc[0].type; + + if (infer_type == nvinfer1::DataType::kFLOAT) { + float* output = static_cast(outputs[0]); + StackKernel<<>>( + reinterpret_cast(workspace), output, num_stacks, + base_unit); + } else if (infer_type == nvinfer1::DataType::kHALF) { +#ifdef SUPPORTS_CUDA_FP16 + __half* output = static_cast<__half*>(outputs[0]); + StackKernel<__half><<>>( + reinterpret_cast(workspace), output, num_stacks, + base_unit); +#else + PADDLE_THROW(platform::errors::Fatal( + "The cuda archs you specific should greater than 600.")); +#endif + } else { + PADDLE_THROW( + platform::errors::Fatal("The Stack TRT Plugin's input type only " + "support float or half currently.")); + } + return cudaGetLastError() != cudaSuccess; +} + +StackPluginDynamicCreator::StackPluginDynamicCreator() {} + +const char* StackPluginDynamicCreator::getPluginName() const { + return "stack_plugin"; +} + +const char* StackPluginDynamicCreator::getPluginVersion() const { return "1"; } + +const nvinfer1::PluginFieldCollection* +StackPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + int axis = -1; + int num_stack = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string name(fc->fields[i].name); + if (name == "axis") { + axis = static_cast(fc->fields[i].data)[0]; + } else if (name == "num_stack") { + num_stack = static_cast(fc->fields[i].data)[0]; + } else { + PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" + + name + + "' 
when creating stack op plugin.")); + } + } + return new StackPluginDynamic(axis, num_stack); +} + +nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new StackPluginDynamic(serial_data, serial_length); + return plugin; +} + +void StackPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) { + plugin_namespace_ = lib_namespace; +} + +const char* StackPluginDynamicCreator::getPluginNamespace() const { + return plugin_namespace_.c_str(); +} + +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..f4f6cde6f87ea97c514e68bc2862bb163b0aa448 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class StackPluginDynamic : public DynamicPluginTensorRT { + public: + explicit StackPluginDynamic(int axis, int num_stack); + StackPluginDynamic(void const* serial_data, size_t serial_length); + ~StackPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + int axis_; + int num_stack_; +}; + +class StackPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + StackPluginDynamicCreator(); + 
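  // Shape rule realized by StackPluginDynamic::getOutputDimensions in
  // stack_op_plugin.cu above: stacking N inputs of rank r along `axis`
  // yields rank r + 1, every dimension at or after `axis` is shifted one
  // slot to the right, and d[axis] becomes N. As an illustrative example,
  // stacking 12 tensors of shape (batch, seq, seq) at axis = 1 gives
  // (batch, 12, seq, seq), matching the (batch, num_head, seq, seq) layout
  // noted in enqueue(). StackKernel then copies
  //   input[s][lead * base_unit + i]
  // to
  //   output[lead * num_stack * base_unit + s * base_unit + i],
  // where base_unit is the product of the dimensions after `axis` and lead
  // runs over the product of the dimensions before it.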
const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + private: + std::string plugin_namespace_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(StackPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 07af5c152b1cd42d1034ed9c5a1d8d8bc3782827..ac05b08b8f2a038234e7192f47a37b3ef3bcf461 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -132,7 +132,9 @@ if(NOT APPLE AND WITH_MKLML) set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) - set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) + if(NOT WIN32) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) + endif() else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -192,8 +194,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) - -set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") +if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") +endif() # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") @@ -215,7 +218,7 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") -if (NOT EXISTS ${OCR_INSTALL_DIR}) +if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) @@ -231,7 +234,7 @@ set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysi # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") -if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) +if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz) inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") endif() inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) @@ -395,15 +398,15 @@ 
inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") - if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz") endif() set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test") - if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}) + if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) inference_download_and_uncompress(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") - if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}) + if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) inference_download_and_uncompress(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc @@ -432,7 +435,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}) + if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc @@ -440,7 +443,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}) + if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -448,12 +451,12 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}) + if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz) inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") endif() set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}) + if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz) inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz") endif() inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc @@ -461,7 +464,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}) set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test") - if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz") endif() @@ -470,7 +473,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) 
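# The EXISTS guards in this block now key off the downloaded archive file
# (e.g. ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) instead of the install
# directory, so the download step re-runs whenever the archive is missing,
# even if the directory itself already exists.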
set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}) + if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc index da0c93d21b7852e06b6805230078540063c2b243..c60e0a25f28c01c453276a8ef04eb79b35b7dda2 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc @@ -54,9 +54,6 @@ TEST(PD_AnalysisConfig, use_gpu) { PD_SwitchIrOptim(config, true); bool ir_optim = PD_IrOptim(config); CHECK(ir_optim) << "NO"; - PD_EnableMkldnnBfloat16(config); - bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config); - CHECK(!bfloat16_enable) << "NO"; PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false, false); bool trt_enable = PD_TensorrtEngineEnabled(config); diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc index 7e5dfa2424dbca4fb3a8a08e3d7fa7fbc3060d3d..524e08891f4e90d8a322822e26d75689526d30f5 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc @@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector result) { config.SwitchUseFeedFetchOps(false); - int head_number = 12; int batch = 1; int min_seq_len = 1; int max_seq_len = 128; @@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector result) { {"read_file_0.tmp_0", min_shape}, {"read_file_0.tmp_1", min_shape}, {"read_file_0.tmp_2", min_shape}, - {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}}; + {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}}; std::map> max_input_shape = { {"read_file_0.tmp_0", max_shape}, {"read_file_0.tmp_1", max_shape}, {"read_file_0.tmp_2", max_shape}, - {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}}; + {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}}; std::map> opt_input_shape = { {"read_file_0.tmp_0", opt_shape}, {"read_file_0.tmp_1", opt_shape}, {"read_file_0.tmp_2", opt_shape}, - {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}}; + {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}}; auto precision = AnalysisConfig::Precision::kFloat32; if (with_fp16) { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index c99ebcdcb5f319f73b7fd931d13f27684db39cad..17fedc3d3b8bb8451fac76f6c7dec4ac057fd1d2 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector result) { config.SwitchUseFeedFetchOps(false); - int head_number = 12; int batch = 1; int min_seq_len = 1; int max_seq_len = 128; @@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector result) { {"read_file_0.tmp_0", min_shape}, {"read_file_0.tmp_1", min_shape}, {"read_file_0.tmp_2", min_shape}, - {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}}; + {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}}; std::map> 
max_input_shape = { {"read_file_0.tmp_0", max_shape}, {"read_file_0.tmp_1", max_shape}, {"read_file_0.tmp_2", max_shape}, - {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}}; + {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}}; std::map> opt_input_shape = { {"read_file_0.tmp_0", opt_shape}, {"read_file_0.tmp_1", opt_shape}, {"read_file_0.tmp_2", opt_shape}, - {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}}; + {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}}; auto precision = AnalysisConfig::Precision::kFloat32; if (with_fp16) { diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index b9f979f96d4b106642795151fb8e34b025b2caef..8bc10f2147fa29102b242ce22e78a88453d6cee4 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -45,7 +45,7 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}) +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index 7aaaa0002c5ab31af72c75e69f5a283c09633ba4..58b56bdcf5614ed9183ce3bf11c1767f92650d20 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -62,11 +62,11 @@ __global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w, int theta_offset = n * 6; // 2 * 3; // affine from (h_coor, w_coor) to (x, y) - output[index * 2] = theta[theta_offset] * h_coor + - theta[theta_offset + 1] * w_coor + + output[index * 2] = theta[theta_offset] * w_coor + + theta[theta_offset + 1] * h_coor + theta[theta_offset + 2]; - output[index * 2 + 1] = theta[theta_offset + 3] * h_coor + - theta[theta_offset + 4] * w_coor + + output[index * 2 + 1] = theta[theta_offset + 3] * w_coor + + theta[theta_offset + 4] * h_coor + theta[theta_offset + 5]; } } @@ -86,13 +86,13 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h, int theta_offset = n * 6; // 2 * 3; T out_grad_x = out_grad[index * 2]; - platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor); - platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor); platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); T out_grad_y = out_grad[index * 2 + 1]; - platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor); - platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * h_coor); platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); } } diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index fd7fa17ac9ae5e540176bb583cf87fa3d00d2945..a82134921ef64f89151eb9c521ea3cbb6f83ee7b 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 
or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OPERATOR( @@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL( int16_t>, paddle::operators::ArgMaxKernel); +REGISTER_OP_VERSION(arg_max) + .AddCheckpoint( + R"ROC( + Upgrade argmax add a new attribute [flatten] and modify the attribute of dtype)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmax over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr( + "dtype", + "change the default value of dtype, the older version " + "is -1, means return the int64 indices." + "The new version is 3, return the int64 indices directly." + "And supporting the dtype of -1 in new version.", + 3)); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index ae3637f6f99783d70bd57a3935a979b0387692de..c296ddcfbef703e8484b6ea0b7f96f037e415186 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -70,6 +70,8 @@ struct VisitDataArgMinMaxFunctor { auto axis = ctx.Attr("axis"); auto keepdims = ctx.Attr("keepdims"); const bool& flatten = ctx.Attr("flatten"); + // paddle do not have the scalar tensor, just return the shape [1] tensor + if (flatten) keepdims = true; // if flatten, will construct the new dims for the cacluate framework::DDim x_dims; @@ -164,15 +166,42 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size())); + const int& dtype = ctx->Attrs().Get("dtype"); + PADDLE_ENFORCE_EQ( + (dtype < 0 || dtype == 2 || dtype == 3), true, + platform::errors::InvalidArgument( + "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " + "received [%s]", + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + static_cast(dtype)))); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + if (ctx->IsRuntime()) { + if (dtype == framework::proto::VarType::INT32) { + int64_t all_element_num = 0; + if (flatten) { + all_element_num = framework::product(x_dims); + + } else { + all_element_num = x_dims[axis]; + } + PADDLE_ENFORCE_LE( + all_element_num, INT_MAX, + "The element num of the argmin/argmax input at axis is " + "%d, is larger than int32 maximum value:%d, you must " + "set the dtype of argmin/argmax to 'int64'.", + all_element_num, INT_MAX); + } + } std::vector vec; if (flatten) { - // if is flatten, will return the only on element - if (keepdims) { - vec.emplace_back(static_cast(1)); - } + vec.emplace_back(static_cast(1)); } else { - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); if (keepdims) { vec.emplace_back(static_cast(1)); @@ -194,10 +223,14 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "Output tensor."); AddAttr("axis", "The axis in which to compute the arg indics."); AddAttr("keepdims", "Keep the dim that to reduce.").SetDefault(false); - AddAttr("dtype", "Keep the dim that to reduce.").SetDefault(-1); AddAttr("flatten", "Flatten the input value, and search the min or max indices") 
.SetDefault(false); + AddAttr("dtype", + "(int, 3), the dtype of indices, the indices dtype must be " + "int32, int64." + "default dtype is int64, and proto value is 3.") + .SetDefault(3); AddComment(string::Sprintf(R"DOC( %s Operator. diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 74fc3292746d26a983fa81ed8cac67b30e23d476..23ed7d727c536225a98a1ea9e6e3af723b4352c3 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OPERATOR( @@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL( int16_t>, paddle::operators::ArgMinKernel); +REGISTER_OP_VERSION(arg_min) + .AddCheckpoint( + R"ROC( + Upgrade argmin add a new attribute [flatten] and modify the attribute of dtype)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmin over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr( + "dtype", + "change the default value of dtype, the older version " + "is -1, means return the int64 indices." + "The new version is 3, return the int64 indices directly." + "And supporting the dtype of -1 in new version.", + 3)); diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu index d0837071d456068f64ebc74b115f1a7904eba41c..6565f5a9a2176972e9e5085c6646097e8349f259 100644 --- a/paddle/fluid/operators/bernoulli_op.cu +++ b/paddle/fluid/operators/bernoulli_op.cu @@ -16,7 +16,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/bernoulli_op.h" @@ -31,6 +30,10 @@ struct BernoulliCudaFunctor { __host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {} __host__ __device__ T operator()(const unsigned int n, const T p) const { + // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several + // lines of error messages if, and it should be refined. 
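    // The PADDLE_ENFORCE below only validates that p lies in [0, 1]; the
    // sampling itself seeds a thrust::minstd_rand with seed_, draws
    // u ~ uniform(0, 1) from the distribution declared afterwards (using the
    // element index n to decorrelate draws), and the functor returns 1 when
    // u < p and 0 otherwise, i.e. a Bernoulli(p) sample per element.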
+ PADDLE_ENFORCE(p >= 0.0 && p <= 1.0, + "The probability should be >=0 and <= 1, but got %f", p); thrust::minstd_rand rng; rng.seed(seed_); thrust::uniform_real_distribution dist(0.0, 1.0); diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h index 06a83ada17bb926d6f7d4eef10750986d00f048c..40f285d11f194057d950f45798bea07439398ab0 100644 --- a/paddle/fluid/operators/bernoulli_op.h +++ b/paddle/fluid/operators/bernoulli_op.h @@ -25,10 +25,12 @@ namespace operators { template inline HOSTDEVICE T BernoulliFunctor(T p, T rand) { - PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange( - "The probability should be <= 1, but got %f", p)); - PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange( - "The probability should be >= 1, but got %f", p)); + PADDLE_ENFORCE_LE(p, 1.0, + platform::errors::OutOfRange( + "The probability should be <= 1, but got %f", p)); + PADDLE_ENFORCE_GE(p, 0.0, + platform::errors::OutOfRange( + "The probability should be >= 0, but got %f", p)); return static_cast(rand < p); } diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index 03abfe7eb703b021dac2261dcd9c87d440b04001..68f5d5460efd16a79d6e1553c2fb78da31fc704a 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -66,7 +66,7 @@ template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto max = context.Attr("max"); + auto max = static_cast(context.Attr("max")); Tensor max_cpu; if (context.HasInput("Max")) { auto* max_t = context.Input("Max"); @@ -77,8 +77,9 @@ class ClipKernel : public framework::OpKernel { } max = max_data[0]; } + max = static_cast(max); - auto min = context.Attr("min"); + auto min = context.Attr("min"); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); @@ -141,7 +142,7 @@ template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto max = context.Attr("max"); + auto max = static_cast(context.Attr("max")); Tensor max_cpu; if (context.HasInput("Max")) { auto* max_t = context.Input("Max"); @@ -152,8 +153,9 @@ class ClipGradKernel : public framework::OpKernel { } max = max_data[0]; } + max = static_cast(max); - auto min = context.Attr("min"); + auto min = context.Attr("min"); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); @@ -164,6 +166,7 @@ class ClipGradKernel : public framework::OpKernel { } min = min_data[0]; } + min = static_cast(min); auto* d_out = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index c9c42e0938d51991c53b74ac6ad59c350f4a3ced..de77121ee3990366771723e3c43e53362c832ef7 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -62,6 +62,34 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, gpu_dev_ctx.Wait(); #else PADDLE_THROW("Unexpected branch"); +#endif + return true; + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + auto& xpu_dev_ctx = static_cast(dev_ctx); + platform::CPUPlace cpu; + char* p = reinterpret_cast(dest); + while (total_written < length) { + if (!input->GetDirectBufferPointer(&data, &size_to_write)) { + return false; + } + + if (total_written + size_to_write > length) { + 
size_to_write = length - total_written; + } + + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), + reinterpret_cast(p), cpu, data, size_to_write); + p += size_to_write; + total_written += size_to_write; + input->Skip(size_to_write); + } + xpu_dev_ctx.Wait(); +#else + PADDLE_ENFORCE_NOT_NULL( + nullptr, + platform::errors::Unimplemented( + "Not supported XPU, please compile with option WITH_XPU=ON.")); #endif return true; } diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc index 8c093d12585981ee681ae13f0d2e493197c6b9b3..6dfa2670c140fcfb4c409c0f9e9cef49c02a7064 100644 --- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc @@ -25,25 +25,32 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInputs("Ids"), - "Input(Ids) of LookupTableOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("W"), - "Input(W) of LookupTableOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutputs("Outputs"), - "Output(Outs) of LookupTableOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true, + platform::errors::InvalidArgument( + "Input(Ids) of LookupTableOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, + platform::errors::InvalidArgument( + "Input(W) of LookupTableOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true, + platform::errors::InvalidArgument( + "Output(Outs) of LookupTableOp should not be null.")); auto ids_dims = ctx->GetInputsDim("Ids"); auto table_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(table_dims.size(), 2, - "Only 2 dimensions of the 'Embedding' is supported."); + PADDLE_ENFORCE_EQ( + table_dims.size(), 2, + platform::errors::InvalidArgument( + "Only 2 dimensions of the 'Embedding' is supported.")); for (auto &ids_dim : ids_dims) { PADDLE_ENFORCE_EQ(ids_dim.size(), 2, - "The dimension of the 'Ids' tensor must be 2."); + platform::errors::InvalidArgument( + "The dimension of the 'Ids' tensor must be 2.")); } auto endpoints = ctx->Attrs().Get>("endpoints"); + // for fluid.embedding auto lookup_table_version = ctx->Attrs().Get("lookup_table_version"); diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h index a71451c78a870b71c05b41bdcfb34a85b3e2213b..6387120bc87fc94f40574a3ab7f0aabc98f41e95 100644 --- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h @@ -35,9 +35,30 @@ class DistributedLookupTableKernel : public framework::OpKernel { auto endpoints = context.Attr>("endpoints"); auto is_distributed = context.Attr("is_distributed"); + auto lookup_table_version = + context.Attr("lookup_table_version"); + operators::distributed::prefetchs(id_names, out_names, embedding_name, is_distributed, lookup_tables, endpoints, context, context.scope()); + + if (lookup_table_version == "lookup_table_v2") { + auto &scope = context.scope(); + auto emb_dim = + scope.FindVar(embedding_name)->Get().dims()[1]; + + for (size_t i = 0; i < id_names.size(); ++i) { + auto *id_var = scope.FindVar(id_names[i]); + auto *out_var = scope.FindVar(out_names[i]); + auto *id_tensor = 
id_var->GetMutable(); + auto *out_tensor = out_var->GetMutable(); + + auto id_dims = id_tensor->dims(); + out_tensor->Resize(framework::make_ddim( + {static_cast(id_dims[0]), static_cast(id_dims[1]), + static_cast(emb_dim)})); + } + } } }; diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 4d5e4c4f600314d307125f9b2031026b6aa94f10..49ad67bbca353acc4a79c9e8912d7ae5a70c0021 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -96,6 +96,42 @@ __global__ void RandomGeneratorWithSeed(const size_t n, const int* seed, } } +template +__global__ void RandomGeneratorWithGenerator(const size_t n, uint64_t seed, + const float dropout_prob, + const T* src, MaskType* mask_data, + T* dst, bool is_upscale_in_train, + uint64_t increment) { + curandStatePhilox4_32_10_t state; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = 0; + + MaskType mask; + T dest; + for (; idx < n; idx += blockDim.x * gridDim.x) { + T s = src[idx]; + if (step_size == 0) { + curand_init(seed, idx, increment, &state); + step_size = blockDim.x * gridDim.x; + } else { + curand_init(seed, idx, increment, &state); + } + if (curand_uniform(&state) < dropout_prob) { + mask = 0; + dest = 0; + } else { + mask = 1; + if (is_upscale_in_train) { + dest = s / static_cast(1.0f - dropout_prob); + } else { + dest = s; + } + } + mask_data[idx] = mask; + dst[idx] = dest; + } +} + // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. @@ -150,6 +186,17 @@ class GPUDropoutKernel : public framework::OpKernel { context.Attr("fix_seed") ? context.Attr("seed") : rnd(); } + int device_id = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()) + .GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && (!context.Attr("fix_seed"))) { + auto seed_offset = gen_cuda->IncrementOffset(1); + RandomGeneratorWithGenerator<<>>( + size, seed_offset.first, dropout_prob, x_data, mask_data, y_data, + upscale_in_train, seed_offset.second); + return; + } + RandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index 9d9eb4a82a075f27764a73d0e976dbf3f7181cb1..161c4282ec277a19c19921267eaa4cb46b859900 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -29,6 +29,10 @@ template using EigenMatrix = framework::EigenMatrix; +template +using EigenVector = framework::EigenVector; + template class CPUDropoutKernel : public framework::OpKernel { public: @@ -116,9 +120,9 @@ class DropoutGradKernel : public framework::OpKernel { auto* mask = context.Input("Mask"); grad_x->mutable_data(context.GetPlace()); - auto M = EigenMatrix::Reshape(*mask, 1); - auto dX = EigenMatrix::Reshape(*grad_x, 1); - auto dY = EigenMatrix::Reshape(*grad_y, 1); + auto M = EigenVector::Flatten(*mask); + auto dX = EigenVector::Flatten(*grad_x); + auto dY = EigenVector::Flatten(*grad_y); auto& place = *context.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h index 47bd6af0b95ace2b9b753e38cfc5f191bc1bb942..87e940e2ed6319c4f2957cd846735adb210cd23d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h +++ 
b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -31,6 +31,15 @@ struct ModFunctor { } }; +template +struct InverseModFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + T res = b % a; + if ((res != 0) && ((res < 0) != (a < 0))) res += a; + return res; + } +}; + template struct ModFunctorFP { inline HOSTDEVICE T operator()(T a, T b) const { @@ -40,13 +49,29 @@ struct ModFunctorFP { } }; +template +struct InverseModFunctorFP { + inline HOSTDEVICE T operator()(T a, T b) const { + T res = fmod(b, a); + if ((res != 0) && ((a < 0) != (res < 0))) res += a; + return res; + } +}; + template void elementwise_mod(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - ModFunctor(), z); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + ModFunctor(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseModFunctor(), z); + } } template @@ -54,8 +79,15 @@ void elementwise_mod_fp(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - ModFunctorFP(), z); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, ModFunctorFP(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseModFunctorFP(), z); + } } template diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index c144481f8dedc9317f7657a22ce82e56022d5b89..69c8b60040651179784cd6b77c31c66e892231be 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fill_constant_op.h" @@ -24,15 +25,20 @@ template struct GaussianGenerator { T mean_, std_; unsigned int seed_; + unsigned int offset_ = 0; __host__ __device__ GaussianGenerator(T mean, T std, int seed) : mean_(mean), std_(std), seed_(seed) {} + __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset) + : mean_(mean), std_(std), seed_(seed), offset_(offset) {} + __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); thrust::normal_distribution dist(mean_, std_); - rng.discard(n); + unsigned int new_n = n + offset_; + rng.discard(new_n); return dist(rng); } }; @@ -43,9 +49,11 @@ class GPUGaussianRandomKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); @@ -56,9 +64,23 @@ class GPUGaussianRandomKernel : public framework::OpKernel { T* data = tensor->mutable_data(context.GetPlace()); int64_t size = tensor->numel(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed_offset.first, gen_offset)); + } else { + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } } }; @@ -69,17 +91,33 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed_offset.first, + seed_offset.second)); + } else { + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/linspace_op.cu 
b/paddle/fluid/operators/linspace_op.cu index 8aca892a81d41b1e0a9f7f9c14169c2817ae9452..793253b6b8894de8d89b301921383ebfd53d66fc 100644 --- a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/linspace_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -19,6 +20,8 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template __global__ void LinspaceKernel(T start, double step, int64_t size, T* out) { CUDA_KERNEL_LOOP(index, size) { @@ -35,15 +38,27 @@ template class CUDALinspaceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* stop_t = context.Input("Stop"); + auto* pre_start = context.Input("Start"); + auto* pre_stop = context.Input("Stop"); auto* num_t = context.Input("Num"); auto* out = context.Output("Out"); + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor start_t; + Tensor stop_t; + auto start_dtype = + framework::OpKernelType(pre_start->type(), context.GetPlace()); + auto stop_dtype = + framework::OpKernelType(pre_stop->type(), context.GetPlace()); + auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); + framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); + framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); framework::Tensor n; - framework::TensorCopy(*start_t, platform::CPUPlace(), &n); + framework::TensorCopy(start_t, platform::CPUPlace(), &n); T start = n.data()[0]; - framework::TensorCopy(*stop_t, platform::CPUPlace(), &n); + framework::TensorCopy(stop_t, platform::CPUPlace(), &n); T stop = n.data()[0]; framework::TensorCopy(*num_t, platform::CPUPlace(), &n); int32_t num = n.data()[0]; diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h index 9fb4960375ed7be60598d558c65310bd4a4b84bc..898f611f864dc8bfac2ba7e41b91f5f5bbe524ab 100644 --- a/paddle/fluid/operators/linspace_op.h +++ b/paddle/fluid/operators/linspace_op.h @@ -14,20 +14,38 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class CPULinspaceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - T start = context.Input("Start")->data()[0]; - T stop = context.Input("Stop")->data()[0]; + auto* pre_start = context.Input("Start"); + auto* pre_stop = context.Input("Stop"); int32_t num = context.Input("Num")->data()[0]; auto* out = context.Output("Out"); + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor start_t; + Tensor stop_t; + auto start_dtype = + framework::OpKernelType(pre_start->type(), context.GetPlace()); + auto stop_dtype = + framework::OpKernelType(pre_stop->type(), context.GetPlace()); + auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); + framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); + framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); + + T start = start_t.data()[0]; + T stop = stop_t.data()[0]; PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); out->Resize(framework::make_ddim({num})); diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 1b4db94b298c53382ee4c657e24b1b6fe6b7f62b..589df8821b3e7fc034df7504fd8d4ce802cc4ecb 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -70,6 +70,7 @@ class LoadCombineOpKernel : public framework::OpKernel { auto out_vars = context.MultiOutputVar("Out"); for (size_t i = 0; i < out_var_names.size(); i++) { + VLOG(4) << "loading tensor: " << out_var_names[i]; PADDLE_ENFORCE_NOT_NULL( out_vars[i], platform::errors::InvalidArgument( "The variable %s to be loaded cannot be found.", diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc index 122e01f146ccddbdc8e72aba67d47855ad30b0eb..4a6680d76c4de7f7f47445b593b1cf50cd6e1311 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cc +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include - #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" namespace paddle { @@ -196,3 +196,14 @@ REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel, REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad, ops::LookupTableV2GradKernel, ops::LookupTableV2GradKernel); + +/* ========================== register checkpoint ===========================*/ +REGISTER_OP_VERSION(lookup_table_v2) + .AddCheckpoint( + R"ROC(fix lookup_table_v2, add input type `int32`)ROC", + paddle::framework::compatible::OpVersionDesc() + .BugfixWithBehaviorChanged("lookup_table_v2 support input type " + "`int64`; after support input type " + "`int32/int64`")); + +/* ========================================================================== */ diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index b3b0f8f1960901226a2f4d5e59e7aac47907a5bf..551f0d3c6412e46deb311fac58e5b9638feb30a6 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -85,6 +85,14 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids, } } +template +__global__ void InputTypeCovert(const T *in_ids, const int64_t K, + int64_t *out_ids) { + for (int i = 0; i < K; i++) { + out_ids[i] = (int64_t)(in_ids[i]); + } +} + template class LookupTableV2CUDAKernel : public framework::OpKernel { public: @@ -101,23 +109,37 @@ class LookupTableV2CUDAKernel : public framework::OpKernel { size_t D = table_t->dims()[1]; size_t K = ids_t->numel(); - auto *ids = ids_t->data(); - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - dim3 threads(256, 4); dim3 grids(80, 1); + // copy GPU memory to CPU pinned memory + framework::Vector ids; + ids.resize(K); + + const int64_t *ids_p = nullptr; + + if (ids_t->type() == framework::proto::VarType::INT32) { + InputTypeCovert< + int><<>>( + ids_t->data(), K, ids.MutableData(context.GetPlace())); + ids_p = ids.MutableData(context.GetPlace()); + } else { + ids_p = ids_t->data(); + } + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + if (padding_idx == -1) LookupTableV2< T, 256, 4, 80, false><<>>( - output, table, ids, N, K, D, padding_idx); + output, table, ids_p, N, K, D, padding_idx); else LookupTableV2< T, 256, 4, 80, true><<>>( - output, table, ids, N, K, D, padding_idx); + output, table, ids_p, N, K, D, padding_idx); } }; @@ -139,16 +161,24 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); - + dim3 threads(128, 8); + dim3 grids(8, 1); auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_num); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); - // TODO(yuyang18): Strange code here. 
- memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), - gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + if (ids->type() == framework::proto::VarType::INT32) { + InputTypeCovert< + int><<>>( + ids->data(), ids_num, + new_rows.MutableData(context.GetPlace())); + } else { + memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + } + d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); @@ -177,17 +207,32 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t *ids = ids_t->data(); + + dim3 threads(128, 8); + dim3 grids(8, 1); + // copy GPU memory to CPU pinned memory + framework::Vector ids; + ids.resize(K); + + const int64_t *ids_p = nullptr; + + if (ids_t->type() == framework::proto::VarType::INT32) { + InputTypeCovert< + int><<>>( + ids_t->data(), K, ids.MutableData(context.GetPlace())); + ids_p = ids.MutableData(context.GetPlace()); + } else { + ids_p = ids_t->data(); + } + const T *d_output = d_output_t->data(); T *d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); - dim3 threads(128, 8); - dim3 grids(8, 1); LookupTableV2Grad<<>>( - d_table, d_output, ids, N, K, D); + d_table, d_output, ids_p, N, K, D); } } }; diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 9aab90d84796ca5c7f37a818595ce87fb3a554b5..092c5f3b03305608f96fcc2834ad74a3388ed7ed 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include @@ -45,84 +46,70 @@ class LookupTableV2Kernel : public framework::OpKernel { auto *output_t = context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - auto id_name = context.InputNames("Ids").front(); - auto embedding_name = context.InputNames("W").front(); - auto out_name = context.OutputNames("Out").front(); - - // for remote prefetch - auto epmap = context.Attr>("epmap"); - auto remote_prefetch = context.Attr("remote_prefetch"); - auto table_names = context.Attr>("table_names"); + int64_t padding_idx = context.Attr("padding_idx"); + int64_t ids_numel = ids_t->numel(); - if (remote_prefetch && !epmap.empty()) { -// if epmap is not empty, then the parameter will be fetched from remote -// parameter server + std::vector ids; + ids.reserve(ids_numel); -#ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch(id_name, out_name, embedding_name, false, - table_names, epmap, context, - context.scope()); -#else - PADDLE_THROW( - "paddle is not compiled with distribute support, can not do " - "parameter prefetch!"); -#endif + if (ids_t->type() == framework::proto::VarType::INT32) { + std::transform(ids_t->data(), ids_t->data() + ids_numel, + std::back_inserter(ids), + [&](int id) { return static_cast(id); }); } else { - int64_t padding_idx = context.Attr("padding_idx"); - int64_t *ids = const_cast(ids_t->data()); - int64_t ids_numel = ids_t->numel(); - - if (table_var->IsType()) { - auto *table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT( - ids[i], row_number, - "Variable value (input) of OP(fluid.layers.embedding) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - row_number, ids[i]); - PADDLE_ENFORCE_GE( - ids[i], 0, - "Variable value (input) of OP(fluid.layers.embedding) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - row_number, ids[i]); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); - } + framework::TensorToVector(*ids_t, &ids); + } + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT( + ids[i], row_number, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + row_number, ids[i]); + PADDLE_ENFORCE_GE( + ids[i], 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + row_number, ids[i]); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto *table = table_t.value().data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE( - ids[i], 0, - "Variable value (input) of OP(fluid.layers.embedding) " - "expected >= 0. But received %ld", - ids[i]); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE( - id_index, 0, "the input key should be exists. But received %d.", - id_index); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); - } + } + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE( + ids[i], 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0. But received %ld", + ids[i]); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, + "the input key should be exists. But received %d.", + id_index); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); } } } @@ -151,17 +138,23 @@ class LookupTableV2GradKernel : public framework::OpKernel { // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. 
if (is_sparse) { - auto *ids = context.Input("Ids"); + auto *ids_t = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); + int64_t ids_num = ids_t->numel(); + + std::vector ids; + ids.reserve(ids_num); - auto *ids_data = ids->data(); - int64_t ids_num = ids->numel(); + if (ids_t->type() == framework::proto::VarType::INT32) { + std::transform(ids_t->data(), ids_t->data() + ids_num, + std::back_inserter(ids), + [&](int id) { return static_cast(id); }); + } else { + framework::TensorToVector(*ids_t, &ids); + } - std::vector new_rows; - new_rows.resize(ids_num); - std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - d_table->set_rows(new_rows); + d_table->set_rows(ids); auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_num, table_dim[1]}); @@ -185,11 +178,23 @@ class LookupTableV2GradKernel : public framework::OpKernel { memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { - auto *ids = context.Input("Ids"); + auto *ids_t = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); + int64_t ids_num = ids_t->numel(); + + std::vector ids; + ids.reserve(ids_num); + + if (ids_t->type() == framework::proto::VarType::INT32) { + std::transform(ids_t->data(), ids_t->data() + ids_num, + std::back_inserter(ids), + [&](int id) { return static_cast(id); }); + } else { + framework::TensorToVector(*ids_t, &ids); + } - auto *ids_data = ids->data(); + auto *ids_data = ids.data(); int64_t N = table_dim[0]; int64_t D = table_dim[1]; @@ -199,7 +204,7 @@ class LookupTableV2GradKernel : public framework::OpKernel { memset(d_table_data, 0, d_table->numel() * sizeof(T)); - for (int64_t i = 0; i < ids->numel(); ++i) { + for (int64_t i = 0; i < ids_num; ++i) { if (padding_idx != kNoPadding && ids_data[i] == padding_idx) { // the gradient of padding_idx should be 0, already done by memset, so // do nothing. 
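Editor's note: the lookup_table_v2 CPU kernels in the hunk above now accept either INT32 or INT64 id tensors by first normalizing the ids into an int64_t vector and only then indexing the table (the GPU path does the analogous conversion with the InputTypeCovert kernel shown earlier). A minimal, self-contained sketch of that normalization step, using plain pointers instead of Paddle tensors — the helper name NormalizeIds is illustrative and not part of the patch:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

// Normalize ids of either 32-bit or 64-bit integer type into int64_t,
// mirroring the std::transform / TensorToVector branch in the kernel above.
template <typename IdT>
std::vector<int64_t> NormalizeIds(const IdT* ids, int64_t ids_num) {
  std::vector<int64_t> out;
  out.reserve(ids_num);
  std::transform(ids, ids + ids_num, std::back_inserter(out),
                 [](IdT id) { return static_cast<int64_t>(id); });
  return out;
}

int main() {
  const std::vector<int32_t> int32_ids = {3, 0, 7};
  const std::vector<int64_t> int64_ids = {3, 0, 7};

  // Both input widths end up as the same int64_t row indices.
  auto a = NormalizeIds(int32_ids.data(), static_cast<int64_t>(int32_ids.size()));
  auto b = NormalizeIds(int64_ids.data(), static_cast<int64_t>(int64_ids.size()));
  std::cout << (a == b ? "identical" : "different") << "\n";  // prints "identical"
  return 0;
}
```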
diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 3a5eddcbf4af699a89ae1a21571337155699a1f3..18d9a6310dd6c09905ca7fa84d98f391a84dfa2d 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,13 +65,14 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16) +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + macro(::paddle::platform::bfloat16) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 6748d0ab43f70f997b3008f34f4be743b81e8946..824e66b1eb4ae05cc74dc1cd8c21f16f286592e6 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -34,6 +34,7 @@ namespace math { using float16 = paddle::platform::float16; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -41,16 +42,18 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_CPU_TRANS(1); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 5ca9216d0c8d6b3f773a1eb1a0cec216ca6ed4f3..487deb11b48687a91174c8d9baf072a5ca929de8 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -33,10 +33,12 @@ class MKLDNNActivationKernel public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); - PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, - "Wrong layout set for X tensor"); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for X tensor"); + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor")); Functor functor; functor(ctx); @@ -50,9 +52,11 @@ class MKLDNNActivationGradKernel void Compute(const framework::ExecutionContext &ctx) const override { const auto *diff_y = ctx.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input OutGrad tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input OutGrad tensor")); PADDLE_ENFORCE_NE(diff_y->format(), 
MKLDNNMemoryFormat::undef, - "Wrong format set for Input OutGrad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input OutGrad tensor")); Functor functor; functor(ctx); @@ -82,7 +86,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, PADDLE_ENFORCE( x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4, - "Input dim must be with 2, 3 or 4"); + platform::errors::Unimplemented("Input dim must be with 2, 3 or 4")); auto src_tz = framework::vectorize(x->dims()); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 33cf00b2c01da8e346e4c7e6be81fce3fd47f54f..8a02a697cbb21b28e14f19c6202ae0777b5102de 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -262,9 +262,11 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input diff_y tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input diff_y tensor")); PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input diff_y tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input diff_y tensor")); auto src_tz = paddle::framework::vectorize(x->dims()); auto scale_tz = paddle::framework::vectorize(scale->dims()); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 40f64800a0b81a161805857cb3e0a3855f386720..3cafb0e9fc6147626f066bbeba1b10d074a37b87 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -30,10 +30,12 @@ using platform::to_void_cast; static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input tensor"); + PADDLE_ENFORCE_EQ( + input->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Input tensor")); + PADDLE_ENFORCE_NE( + input->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Input tensor")); } } @@ -49,7 +51,7 @@ static platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); PADDLE_ENFORCE(paddle::platform::is_cpu_place(place), - "It must use CPUPlace."); + platform::errors::InvalidArgument("It must use CPUPlace.")); return BOOST_GET_CONST(platform::CPUPlace, place); } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 7d99bb7d2b7a7049c67788df4c507afc14880815..19ee8764e27b235a2fa8e0720c11bce601b030db 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -561,7 +561,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( !fuse_residual_conn || !force_fp32_output, true, - "residual fusion does not support force output with fp32"); + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); auto* bias = ctx.HasInput("Bias") ? 
ctx.Input("Bias") : nullptr; @@ -625,7 +626,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - true, "dilation in convolution is not implemented yet"); + true, platform::errors::Unimplemented( + "dilation in convolution is not implemented yet")); const K* filter_data = filter->data(); auto scale_in_data = ctx.Attr("Scale_in"); @@ -887,7 +889,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { "The output_grad tensor's layout should be %d, but got %d.", DataLayout::kMKLDNN, output_grad->layout())); PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for output_grad tensor")); PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), false, @@ -1052,7 +1055,11 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { astream.wait(); filter_grad->set_layout(DataLayout::kMKLDNN); - filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); + // in OneDNN groups in convolution are treated as separate dimension + // which is not the case in paddlepaddle + auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); + filter_grad->set_format(platform::MKLDNNFormatForSize( + g > 1 ? weights_tz.size() - 1 : weights_tz.size(), filter_fmt)); } if (input_grad) { auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive( diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 48279658c80e93428f940c40e61d7b9af23f4ee3..56537900216a8a4e4e96791123c7d50da621ab62 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -117,7 +117,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - "dilation in convolution is not implemented yet"); + platform::errors::Unimplemented( + "dilation in convolution is not implemented yet")); const T* input_data = input->data(); const T* filter_data = filter->data(); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 2a8b332521804ccebdbd4e6914b2763abfb5dbdc..9df30b3295c00e69a956ee84770dfeb19a83487c 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -83,19 +83,24 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input tensor"); + PADDLE_ENFORCE_EQ( + in_x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Input tensor")); + PADDLE_ENFORCE_NE( + in_x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input output_grad tensor")); PADDLE_ENFORCE_NE(out_grad->format(), 
MKLDNNMemoryFormat::undef, - "Wrong format set for Input output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input output_grad tensor")); PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), false, - "is_test attribute should be set to False in training phase."); + platform::errors::InvalidArgument( + "is_test attribute should be set to False in training phase.")); std::string pooling_type = ctx.Attr("pooling_type"); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 4d825e4ee279bc2c505cfabff1917d1a5319d1dd..5014381a4e215917883f45288de4482db5cbf79c 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -140,7 +140,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( dout->dims(), dx->dims(), - "The shape of softmax_grad's input and output must be identical."); + platform::errors::InvalidArgument( + "The shape of softmax_grad's input and output must be identical.")); auto dims = dout->dims(); // input and output share the same shape const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); diff --git a/paddle/fluid/operators/randint_op.cu b/paddle/fluid/operators/randint_op.cu index a07a92621e6b3726be518df6abcec58257a91489..40e390b0b87246bbaa8474262df8ba5576297385 100644 --- a/paddle/fluid/operators/randint_op.cu +++ b/paddle/fluid/operators/randint_op.cu @@ -13,6 +13,7 @@ // limitations under the License. #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/uniform_random_op.h" @@ -49,15 +50,23 @@ class GPURandintKernel : public framework::OpKernel { int64_t size = out->numel(); unsigned int seed = static_cast(context.Attr("seed")); + + /* std::minstd_rand engine; if (seed == 0) { std::random_device rd; seed = rd(); } engine.seed(seed); + */ + std::uniform_int_distribution<> dist(context.Attr("low"), context.Attr("high") - 1); - for (int64_t i = 0; i < size; ++i) data[i] = dist(engine); + auto engine = framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } if (platform::is_gpu_place(context.GetPlace())) { // Copy tensor to out diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index 5a3510babe4d57b9e80f0e7898df98033834ca15..a838c30771a5c1229061a58b12c6777a3d24c6f3 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -46,6 +47,37 @@ struct TruncatedNormal { } }; +template +struct TruncatedNormalOffset { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + unsigned int seed; + T numeric_min; + int offset_; + + __host__ __device__ TruncatedNormalOffset(T mean, T std, T numeric_min, + int seed, int offset) + : mean(mean), + std(std), + seed(seed), + numeric_min(numeric_min), + offset_(offset) { + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + } + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(numeric_min, 1); + rng.discard(n); + T value = dist(rng); + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + } +}; + template class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { public: @@ -54,14 +86,31 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); + + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormalOffset(mean, std, std::numeric_limits::min(), + seed_offset.first, seed_offset.second)); + } + thrust::transform( index_sequence_begin, index_sequence_begin + size, thrust::device_ptr(data), diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 4df1e0ffeb97564803f452114d52ab03d0464f8a..6237137cccbc6840b345c9e26dda1ccdc8df43b0 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -51,6 +51,39 @@ struct UniformGenerator { } }; +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, + int diag_num, int diag_step, + T diag_val, int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + // It seems that Eigen::Tensor::random in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. 
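Editor's note: the UniformGeneratorOffset functor above makes every output element reproducible on its own — the engine is seeded once with a fixed seed and then skipped ahead by n + offset before a single draw, so a later launch that passes a larger cumulative offset continues the stream instead of repeating it. Below is a host-side sketch of that skip-ahead idea using std::minstd_rand; it is illustrative only (the kernel itself uses thrust::minstd_rand on the device), and in the kernels above the cumulative offset appears to come from the generator's IncrementOffset call, which is what keeps successive launches from reusing the same stream positions.

```cpp
#include <iostream>
#include <random>

// Draw the n-th value of a seeded stream by skipping ahead, mirroring
// rng.discard(n + offset_) in the UniformGeneratorOffset functor above.
float NthUniform(unsigned int seed, unsigned long long n, float lo, float hi) {
  std::minstd_rand rng(seed);
  rng.discard(n);  // element n does not depend on how other elements are drawn
  std::uniform_real_distribution<float> dist(lo, hi);
  return dist(rng);
}

int main() {
  const unsigned int seed = 2020;
  const int size = 4;

  // First launch: offset 0, logical elements 0..3.
  for (int n = 0; n < size; ++n)
    std::cout << NthUniform(seed, n, 0.f, 1.f) << " ";
  std::cout << "\n";

  // Second launch: offset advanced by the first launch's size, so the
  // values continue the stream rather than repeating it.
  const int offset = size;
  for (int n = 0; n < size; ++n)
    std::cout << NthUniform(seed, n + offset, 0.f, 1.f) << " ";
  std::cout << "\n";
  return 0;
}
```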
@@ -89,10 +122,11 @@ class GPUUniformRandomKernel : public framework::OpKernel { } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T min = static_cast(context.Attr("min")); @@ -104,10 +138,23 @@ class GPUUniformRandomKernel : public framework::OpKernel { T diag_val = static_cast(context.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGeneratorOffset(min, max, seed_offset.first, diag_num, + diag_step, diag_val, gen_offset)); + } else { + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); + } } }; diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index c33e7c6068648d019a38450a92fec79032411598..ee1361e3618302816200efc759ebd18ee05c9274 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/unsqueeze_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -327,6 +329,7 @@ REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, REGISTER_OP_CPU_KERNEL( unsqueeze, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -334,12 +337,14 @@ REGISTER_OP_CPU_KERNEL( unsqueeze_grad, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel); REGISTER_OP_CPU_KERNEL( unsqueeze2, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -347,6 +352,7 @@ REGISTER_OP_CPU_KERNEL( unsqueeze2_grad, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc index 3258de53b8b7cda994c9555bf6a62502f3c04c23..0e8f47a692380cc96a371bb7a5319af89a3d28c4 100644 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ b/paddle/fluid/operators/unsqueeze_op.cu.cc @@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL( unsqueeze, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -30,6 +31,7 @@ REGISTER_OP_CUDA_KERNEL( ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel); @@ -38,6 +40,7 @@ REGISTER_OP_CUDA_KERNEL( ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -47,6 +50,7 @@ REGISTER_OP_CUDA_KERNEL( ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 652b4dd47daa8aecdcae43e8c910d7dd61bbb64d..ef827fd74903afd007c864307e942749e3eb0bd1 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -136,6 +136,8 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) +cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) + nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..742329abb2dae20437120c0d4ba5975d41b0a7c9 --- /dev/null +++ b/paddle/fluid/platform/bfloat16.h @@ -0,0 +1,439 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#include +#include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +struct PADDLE_ALIGN(2) bfloat16 { + public: + uint16_t x; + + bfloat16() = default; + bfloat16(const bfloat16& o) = default; + bfloat16& operator=(const bfloat16& o) = default; + bfloat16(bfloat16&& o) = default; + bfloat16& operator=(bfloat16&& o) = default; + ~bfloat16() = default; + + HOSTDEVICE inline explicit bfloat16(float val) { + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); + } + + template + HOSTDEVICE inline explicit bfloat16(const T& val) + : x(bfloat16(static_cast(val)).x) {} + + HOSTDEVICE inline bfloat16& operator=(bool b) { + x = b ? 0x3f80 : 0; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(float val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(double val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline explicit operator float() const { + float val = 0.f; + uint16_t temp = x; + memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), + 2); + return val; + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(static_cast(*this)); + } +}; + 
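Editor's note: the bfloat16 struct above stores only the upper 16 bits of an IEEE-754 binary32 value; the memcpy at byte offset 2 in its float constructor and conversion operator assumes a little-endian host. A small standalone sketch of the same truncation scheme, written with shifts so it is endianness-neutral — the helper names here are illustrative, not Paddle APIs:

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>

// Truncate a float to bfloat16 by keeping its upper 16 bits,
// and widen back by placing those bits into the high half of a float.
uint16_t FloatToBF16(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // drop the low 16 mantissa bits
}

float BF16ToFloat(uint16_t x) {
  uint32_t bits = static_cast<uint32_t>(x) << 16;  // low mantissa bits become zero
  float v;
  std::memcpy(&v, &bits, sizeof(v));
  return v;
}

int main() {
  std::cout << std::hex << FloatToBF16(1.0f) << "\n";  // 3f80, matching bfloat16(1.0f).x
  std::cout << std::hex << FloatToBF16(0.5f) << "\n";  // 3f00
  // The round trip loses the low mantissa bits: 0.33333f comes back as ~0.332031.
  std::cout << std::dec << BF16ToFloat(FloatToBF16(0.33333f)) << "\n";
  return 0;
}
```

Truncation (rather than round-to-nearest) is the simplest possible conversion, and it is consistent with the bit patterns the unit tests further below expect, e.g. bfloat16(0.33333f).x == 0x3eaa.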
+HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) + static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) - static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) * static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { + bfloat16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) + static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) - static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) * static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) / static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { + bfloat16 res; + res.x = a; + return res; +} + +HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { + return static_cast(a) == static_cast(b); +} + +HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) != static_cast(b); +} + +HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { + return static_cast(a) < static_cast(b); +} + +HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) <= static_cast(b); +} + +HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { + return static_cast(a) > static_cast(b); +} + +HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) >= static_cast(b); +} + +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (a.x & 0x7FFF) > 0x7F80; +} + +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (a.x & 0x7F80) == 0x7F80; +} + +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { + os << a.x; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template <> +struct is_pod { + static const bool value = + is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::bfloat16& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::bfloat16& a) { + return paddle::platform::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool 
has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 9; + static const int radix = 2; + static const int min_exponent = -125; + static const int min_exponent10 = -37; + static const int max_exponent = 128; + static const int max_exponent10 = 38; + static const bool traps = true; + static const bool tinyness_before = false; + + static paddle::platform::bfloat16(min)() { + return paddle::platform::raw_uint16_to_bfloat16(0x007f); + } + static paddle::platform::bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + static paddle::platform::bfloat16(max)() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + static paddle::platform::bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + static paddle::platform::bfloat16 round_error() { + return paddle::platform::bfloat16(0.5); + } + static paddle::platform::bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + static paddle::platform::bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } + static paddle::platform::bfloat16 signaling_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xff81); + } + static paddle::platform::bfloat16 denorm_min() { + return paddle::platform::raw_uint16_to_bfloat16(0x0001); + } +}; + +} // namespace std + +namespace Eigen { + +using bfloat16 = paddle::platform::bfloat16; + +template <> +struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + HOSTDEVICE static inline bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + HOSTDEVICE static inline bfloat16 dummy_precision() { + return bfloat16(1e-5f); + } + HOSTDEVICE static inline bfloat16 highest() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + HOSTDEVICE static inline bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + HOSTDEVICE static inline bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + HOSTDEVICE static inline bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } +}; +namespace numext { + +template <> +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 erf(const bfloat16& a) { + return bfloat16(::erff(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(static_cast(a))); +} + +template <> 
+HOSTDEVICE inline bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) { + return bfloat16(::powf(static_cast(a), static_cast(b))); +} + +template <> +HOSTDEVICE inline bfloat16 abs(const bfloat16& a) { + return bfloat16(::fabs(static_cast(a))); +} + +} // namespace numext +} // namespace Eigen diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bdb508ee33630004daae132fcdcf71146a50e640 --- /dev/null +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/bfloat16.h" + +#include + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace platform { + +using bfloat16 = paddle::platform::bfloat16; + +TEST(bfloat16, conversion_cpu) { + // Conversion from float + EXPECT_EQ(bfloat16(1.0f).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5f).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333f).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0f).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0f).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0f).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0f).x, 0x4780); + + // Conversion from double + EXPECT_EQ(bfloat16(1.0).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0).x, 0x4780); + + // Conversion from int + EXPECT_EQ(bfloat16(-1).x, 0xbf80); + EXPECT_EQ(bfloat16(0).x, 0x0000); + EXPECT_EQ(bfloat16(1).x, 0x3f80); + EXPECT_EQ(bfloat16(2).x, 0x4000); + EXPECT_EQ(bfloat16(3).x, 0x4040); + + // Conversion from bool + EXPECT_EQ(bfloat16(true).x, 0x3f80); + EXPECT_EQ(bfloat16(false).x, 0x0000); + + // Assignment operator + bfloat16 v_assign; + v_assign = bfloat16(0.f); + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3f00); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3eaa); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbf80); + + // Conversion operator + EXPECT_EQ(static_cast(bfloat16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(bfloat16(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(bfloat16(-1)), -1); + EXPECT_EQ(static_cast(bfloat16(true)), true); +} + +TEST(bfloat16, arithmetic_cpu) { + EXPECT_NEAR(static_cast(bfloat16(1) + bfloat16(1)), 2, 0.001); + EXPECT_EQ(static_cast(bfloat16(5) + bfloat16(-5)), 0); + EXPECT_NEAR(static_cast(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(3) - 
bfloat16(5)), -2); + EXPECT_NEAR(static_cast(bfloat16(0.66667f) - bfloat16(0.33333f)), + 0.33334f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(3.3f) * bfloat16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(-2.1f) * bfloat16(-3.0f)), 6.3f, 0.1); + EXPECT_NEAR(static_cast(bfloat16(2.0f) / bfloat16(3.0f)), 0.66667f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(1.0f) / bfloat16(2.0f)), 0.5f); + EXPECT_EQ(static_cast(-bfloat16(512.0f)), -512.0f); + EXPECT_EQ(static_cast(-bfloat16(-512.0f)), 512.0f); +} + +TEST(bfloat16, comparison_cpu) { + EXPECT_TRUE(bfloat16(1.0f) == bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-1.0f) == bfloat16(-0.5f)); + EXPECT_TRUE(bfloat16(1.0f) != bfloat16(0.5f)); + EXPECT_FALSE(bfloat16(-1.0f) != bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) < bfloat16(2.0f)); + EXPECT_FALSE(bfloat16(-1.0f) < bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) <= bfloat16(1.0f)); + EXPECT_TRUE(bfloat16(2.0f) > bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-2.0f) > bfloat16(-2.0f)); + EXPECT_TRUE(bfloat16(2.0f) >= bfloat16(2.0f)); +} + +TEST(bfloat16, lod_tensor_cpu) { + framework::LoDTensor lod_tensor; + + std::vector input_data = {bfloat16(1.0f), bfloat16(0.5f), + bfloat16(0.33333f), bfloat16(0.0f)}; + EXPECT_EQ(input_data[0].x, 0x3f80); + EXPECT_EQ(input_data[1].x, 0x3f00); + EXPECT_EQ(input_data[2].x, 0x3eaa); + EXPECT_EQ(input_data[3].x, 0x0000); + + lod_tensor.Resize({4, 1}); + lod_tensor.set_lod(framework::LoD({{0, 2, 4}})); + bfloat16* data_ptr = lod_tensor.mutable_data(CPUPlace()); + + EXPECT_NE(data_ptr, nullptr); + EXPECT_EQ(input_data.size(), static_cast(lod_tensor.numel())); + for (size_t i = 0; i < input_data.size(); ++i) { + data_ptr[i] = input_data[i]; + EXPECT_EQ(data_ptr[i].x, input_data[i].x); + } +} + +TEST(bfloat16, floating) { + // compile time assert. + PADDLE_ENFORCE_EQ( + std::is_floating_point::value, true, + platform::errors::Fatal("std::is_floating_point with bfloat16 data type " + "should be equal to true but it is not")); +} + +TEST(bfloat16, print) { + bfloat16 a = bfloat16(1.0f); + std::cout << a << std::endl; +} + +// CPU test +TEST(bfloat16, isinf) { + bfloat16 a; + a.x = 0x7f80; + bfloat16 b = bfloat16(INFINITY); + bfloat16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(bfloat16, isnan) { + bfloat16 a; + a.x = 0x7fff; + bfloat16 b = bfloat16(NAN); + bfloat16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 7e32720c1d733411178c102d5c4500f722e7d005..562e7542012247c86add9e64f182d857ea969c60 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -38,14 +38,15 @@ extern void *cublas_dso_handle; */ #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ struct DynLoad__##__name { \ - using FUNC_TYPE = decltype(&::__name); \ template \ - inline cublasStatus_t operator()(Args... args) { \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using cublas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ std::call_once(cublas_dso_flag, []() { \ cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ }); \ static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ + return reinterpret_cast(p_##__name)(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 3782eb684f21f8c09e9dac124082ae596fe5d1bc..8fb66c6f34bd8453f1aceb731bb1cd94b8e75a69 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -161,6 +161,12 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::data_type::u8; } +template <> +inline mkldnn::memory::data_type +MKLDNNGetDataType() { + return mkldnn::memory::data_type::bf16; +} + inline void Reorder(mkldnn::memory src, mkldnn::memory dst, const mkldnn::engine& engine) { auto reorder_prim = mkldnn::reorder(src, dst); diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 90b7f501052530a306ba22ea6a244f0ef8fad563..67121e24089f7c6c5b8de985da89039eca85f094 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -59,6 +59,7 @@ void BindGenerator(py::module* m_ptr) { .def_property("_is_init_py", &framework::Generator::GetIsInitPy, &framework::Generator::SetIsInitPy); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); -} // end Generator -} // end namespace pybind + m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); +} +} // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4377a8c2cef5aab7a200955cd25830d448014817..5ee15073267b6eac8978022a70ead5d0f439c62f 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "pybind11/numpy.h" @@ -104,6 +105,7 @@ struct ValidDTypeToPyArrayChecker { } DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -119,6 +121,9 @@ inline std::string TensorDTypeToPyDTypeStr( if (type == proto_type) { \ if (std::is_same::value) { \ return "e"; \ + } else if (std::is_same::value) { \ + /* NumPy character code of uint16 due to no support for bfloat16 */ \ + return "H"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ PADDLE_ENFORCE_EQ( \ @@ -262,10 +267,10 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { - // TODO(cql): temporary keeping uint16, which is used for casting float16 - // before. It should be depracated later. 
- SetTensorFromPyArrayT(self, array, place, - zero_copy); + // since there is still no support for bfloat16 in NumPy, + // uint16 is used for casting bfloat16 + SetTensorFromPyArrayT(self, array, place, + zero_copy); } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); } else { @@ -479,6 +484,8 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, switch (src_type) { case framework::proto::VarType::FP16: return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::BF16: + return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 1f88eb2109aa23b6b60104451908b0a70c41c898..7eab677fac1683fdc95c9e338b1099d78b5cabc3 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -29,8 +29,10 @@ function(train_test TARGET_NAME) PROPERTIES DEPENDS test_${TARGET_NAME}) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES LABELS "RUN_TYPE=DIST") - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES TIMEOUT 150) + if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES TIMEOUT 150) + endif() endforeach() endfunction(train_test) diff --git a/paddle/http.log b/paddle/http.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index cfb59a04f8147f5c09aa08a01bcd304bf8ccc120..f9ec40c1830655d2ccfe1b71270e94341e875fc5 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -20,13 +20,12 @@ rem Paddle CI Task On Windows Platform rem ================================================= rem -------clean up environment----------- +wmic process where name="op_function_generator.exe" call terminate 2>NUL set work_dir=%cd% -if exist build rmdir build /s/q mkdir build cd /d build tree . dir paddle\fluid\pybind\Release -taskkill /f /im op_function_generator.exe 2>NUL rem ------initialize the virtual environment------ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 @@ -216,7 +215,7 @@ pip install -U %PADDLE_WHL_FILE_WIN% --user if %ERRORLEVEL% NEQ 0 ( call paddle_winci\Scripts\deactivate.bat 2>NUL echo pip install whl package failed! - exit /b 3 + exit /b 1 ) python %work_dir%\paddle\scripts\installation_validate.py @@ -225,7 +224,7 @@ goto:eof :test_whl_pacakage_error call paddle_winci\Scripts\deactivate.bat 2>NUL echo Test import paddle failed, will exit! -exit /b 3 +exit /b 1 rem --------------------------------------------------------------------------------------------- :unit_test @@ -248,6 +247,9 @@ goto:eof :unit_test_error call paddle_winci\Scripts\deactivate.bat 2>NUL +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# +set end=%end:~4,10% +call :timestamp "%start%" "%end%" "TestCases Total" echo Running unit tests failed, will exit! exit /b 8 @@ -268,7 +270,7 @@ goto:eof :test_inference_error call paddle_winci\Scripts\deactivate.bat 2>NUL echo Testing fluid library for inference failed! 
-exit /b 5 +exit /b 1 rem --------------------------------------------------------------------------------------------- :check_change_of_unittest @@ -399,7 +401,7 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL -taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL taskkill /f /im python.exe 2>NUL call paddle_winci\Scripts\deactivate.bat 2>NUL taskkill /f /im python.exe 2>NUL diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 926747ef6186e3b9439baf787572fe9d1988fb46..9c1c95f37ed0785bfd770e7cbc02002daba8447b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -273,7 +273,7 @@ function cmake_gen() { function abort(){ echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 echo "Please use pre-commit to check what is wrong." 1>&2 - exit 1 + exit 4 } function check_style() { @@ -303,7 +303,7 @@ function check_style() { if [ $commit_files == 'off' ];then echo "code format error" - exit 1 + exit 4 fi trap : 0 } @@ -528,6 +528,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi + set +e ut_startTime_s=`date +%s` ctest --output-on-failure -j $2;mactest_error=$? ut_endTime_s=`date +%s` @@ -959,7 +960,7 @@ set +x retry_unittests_record="$retry_unittests_record$failed_test_lists" failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' ) + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) echo "=========================================" echo "This is the ${exec_time_array[$exec_times]} time to re-run" echo "=========================================" @@ -1395,24 +1396,49 @@ function example() { fi } +function summary_check_problems() { + set +x + local check_style_code=$1 + local example_code=$2 + if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then + echo "========================================" + echo "summary problems:" + echo "========================================" + if [ $check_style_code -ne 0 ];then + echo "- Check code style failed! Please check the log and fix problems." + fi + if [ $example_code -ne 0 ];then + echo "- Check example code failed! Please check the log and fix problems." + fi + [ $check_style_code -ne 0 ] && exit $check_style_code + [ $example_code -ne 0 ] && exit $example_code + fi + set -x +} + function main() { local CMD=$1 local parallel_number=$2 init if [ "$CMD" != "assert_file_approvals" ];then python ${PADDLE_ROOT}/tools/summary_env.py + bash ${PADDLE_ROOT}/tools/get_cpu_info.sh fi case $CMD in build_only) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} ;; build_and_check) - check_style + set +e + $(check_style >&2) + check_style_code=$? generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" - example + $(example >&2) + example_code=$? 
+ summary_check_problems $check_style_code $example_code assert_api_spec_approvals ;; build) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f98c8f5ee6643f09fcfde21b24b5b9bea510129b..d5793eb424ab794e3e8af8ef2312aac927c272e5 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -217,6 +217,8 @@ from .tensor.search import index_select #DEFINE_ALIAS from .tensor.search import nonzero #DEFINE_ALIAS from .tensor.search import sort #DEFINE_ALIAS from .framework.random import manual_seed #DEFINE_ALIAS +from .framework.random import get_cuda_rng_state #DEFINE_ALIAS +from .framework.random import set_cuda_rng_state #DEFINE_ALIAS from .framework import Variable #DEFINE_ALIAS from .framework import ParamAttr #DEFINE_ALIAS from .framework import create_global_var #DEFINE_ALIAS @@ -230,6 +232,7 @@ from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS +from .framework import SaveLoadConfig #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS @@ -259,7 +262,7 @@ from .device import get_device from .fluid.dygraph.base import enable_dygraph as disable_static #DEFINE_ALIAS from .fluid.dygraph.base import disable_dygraph as enable_static #DEFINE_ALIAS from .fluid.framework import in_dygraph_mode as in_dynamic_mode #DEFINE_ALIAS -from .fluid.dygraph.base import no_grad #DEFINE_ALIAS +from .fluid.dygraph.base import no_grad_ as no_grad #DEFINE_ALIAS from . import jit from . import static @@ -267,5 +270,6 @@ from . import static # high-level api from .hapi import Model from .hapi import callbacks +from .hapi import summary import paddle.text import paddle.vision diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index c40ae7179395a2fc16ece0d68546221ce53c2180..19df0ca91e103a0865f648daa5c173c2691307de 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -73,20 +73,21 @@ def broadcast(tensor, src, group=0): Examples: .. code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data = np.array([[4, 5, 6], [4, 5, 6]]) - else: - np_data = np.array([[1, 2, 3], [1, 2, 3]]) - data = paddle.to_tensor(np_data) - paddle.distributed.broadcast(data, 1) - out = data.numpy() - # [[1, 2, 3], [1, 2, 3]] + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data = np.array([[4, 5, 6], [4, 5, 6]]) + else: + np_data = np.array([[1, 2, 3], [1, 2, 3]]) + data = paddle.to_tensor(np_data) + paddle.distributed.broadcast(data, 1) + out = data.numpy() + # [[1, 2, 3], [1, 2, 3]] """ if in_dygraph_mode(): return core.ops.c_broadcast(tensor, tensor, 'root', src, @@ -129,21 +130,22 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0): Examples: .. 
code-block:: python - import paddle - from paddle.distributed import ReduceOp - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data = np.array([[4, 5, 6], [4, 5, 6]]) - else: - np_data = np.array([[1, 2, 3], [1, 2, 3]]) - data = paddle.to_tensor(np_data) - paddle.distributed.all_reduce(data) - out = data.numpy() - # [[5, 7, 9], [5, 7, 9]] + import numpy as np + import paddle + from paddle.distributed import ReduceOp + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data = np.array([[4, 5, 6], [4, 5, 6]]) + else: + np_data = np.array([[1, 2, 3], [1, 2, 3]]) + data = paddle.to_tensor(np_data) + paddle.distributed.all_reduce(data) + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] """ if in_dygraph_mode(): if op == ReduceOp.SUM: @@ -204,20 +206,21 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0): Examples: .. code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data = np.array([[4, 5, 6], [4, 5, 6]]) - else: - np_data = np.array([[1, 2, 3], [1, 2, 3]]) - data = paddle.to_tensor(np_data) - paddle.distributed.reduce(data, 0) - out = data.numpy() - # [[5, 7, 9], [5, 7, 9]] + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data = np.array([[4, 5, 6], [4, 5, 6]]) + else: + np_data = np.array([[1, 2, 3], [1, 2, 3]]) + data = paddle.to_tensor(np_data) + paddle.distributed.reduce(data, 0) + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] """ if in_dygraph_mode(): if op == ReduceOp.SUM: @@ -286,25 +289,26 @@ def all_gather(tensor_list, tensor, group=0): Examples: .. 
code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - tensor_list = [] - if paddle.ParallelEnv().local_rank == 0: - np_data1 = np.array([[4, 5, 6], [4, 5, 6]]) - np_data2 = np.array([[4, 5, 6], [4, 5, 6]]) - data1 = paddle.to_tensor(np_data1) - data2 = paddle.to_tensor(np_data2) - paddle.distributed.all_gather(tensor_list, data1) - else: - np_data1 = np.array([[1, 2, 3], [1, 2, 3]]) - np_data2 = np.array([[1, 2, 3], [1, 2, 3]]) - data1 = paddle.to_tensor(np_data1) - data2 = paddle.to_tensor(np_data2) - out = paddle.distributed.all_gather(tensor_list, data2) + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + tensor_list = [] + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data1 = np.array([[4, 5, 6], [4, 5, 6]]) + np_data2 = np.array([[4, 5, 6], [4, 5, 6]]) + data1 = paddle.to_tensor(np_data1) + data2 = paddle.to_tensor(np_data2) + paddle.distributed.all_gather(tensor_list, data1) + else: + np_data1 = np.array([[1, 2, 3], [1, 2, 3]]) + np_data2 = np.array([[1, 2, 3], [1, 2, 3]]) + data1 = paddle.to_tensor(np_data1) + data2 = paddle.to_tensor(np_data2) + paddle.distributed.all_gather(tensor_list, data2) """ op_type = 'c_allgather' helper = LayerHelper(op_type, **locals()) @@ -359,25 +363,26 @@ def scatter(tensor, tensor_list=None, src=0, group=0): Examples: .. code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data1 = np.array([7, 8, 9]) - np_data2 = np.array([10, 11, 12]) - else: - np_data1 = np.array([1, 2, 3]) - np_data2 = np.array([4, 5, 6]) - data1 = paddle.to_tensor(np_data1) - data2 = paddle.to_tensor(np_data2) - if paddle.ParallelEnv().local_rank == 0: - paddle.distributed.scatter(data1, src=1) - else: - paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1) - out = data1.numpy() + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data1 = np.array([7, 8, 9]) + np_data2 = np.array([10, 11, 12]) + else: + np_data1 = np.array([1, 2, 3]) + np_data2 = np.array([4, 5, 6]) + data1 = paddle.to_tensor(np_data1) + data2 = paddle.to_tensor(np_data2) + if paddle.distributed.ParallelEnv().local_rank == 0: + paddle.distributed.scatter(data1, src=1) + else: + paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1) + out = data1.numpy() """ op_type = 'c_scatter' global _default_group @@ -425,13 +430,13 @@ def barrier(group=0): Examples: .. 
code-block:: python - import paddle - import paddle.prepare_context as prepare_context + import paddle + from paddle.distributed import init_parallel_env - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - paddle.distributed.barrier() + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + paddle.distributed.barrier() """ op_type = 'barrier' temp = paddle.fill_constant([1], dtype="int32", value="1") diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 42ac68ba1a64de54f029878ceab08435c924d087..5f0cf9f93d62eba9b81e8a834b52f84122f2702d 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -50,3 +50,10 @@ distributed_optimizer = fleet.distributed_optimizer save_inference_model = fleet.save_inference_model save_persistables = fleet.save_persistables minimize = fleet.minimize +distributed_model = fleet.distributed_model +step = fleet.step +clear_grad = fleet.clear_grad +set_lr = fleet.set_lr +get_lr = fleet.get_lr +state_dict = fleet.state_dict +set_state_dict = fleet.set_state_dict diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 26063d1b8a9225aff63628bb37f433ec95257dc7..9c1793fd5b56eb728ae7d16840cf4fb09cf975c8 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -118,7 +118,7 @@ class DistributedStrategy(object): strategy = fleet.DistributedStrategy() strategy.dgc = True strategy.recompute = True - strategy.recompute_configs = {"checkpoint": ["x"]} + strategy.recompute_configs = {"checkpoints": ["x"]} strategy.save_to_prototxt("dist_strategy.prototxt") """ with open(output, "w") as fout: @@ -133,7 +133,7 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() - strategy.load_from_prototxt("dist_strategy.protoxt") + strategy.load_from_prototxt("dist_strategy.prototxt") """ with open(pb_file, 'r') as f: self.strategy = google.protobuf.text_format.Merge( @@ -147,6 +147,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle exe_strategy = paddle.fluid.ExecutionStrategy() exe_strategy.num_threads = 10 exe_strategy.num_iteration_per_drop_scope = 10 @@ -179,6 +180,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle build_strategy = paddle.fluid.BuildStrategy() build_strategy.enable_sequential_execution = True build_strategy.fuse_elewise_add_act_ops = True @@ -252,14 +254,19 @@ class DistributedStrategy(object): a dict. 
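The corrected `save_to_prototxt`/`load_from_prototxt` docstrings above are each one half of a round trip. A minimal combined sketch (the file name is illustrative, and a Paddle build with `paddle.distributed.fleet` is assumed):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.recompute = True
    strategy.recompute_configs = {"checkpoints": ["x"]}
    strategy.save_to_prototxt("dist_strategy.prototxt")

    # reload the serialized strategy and check that one of the switches survived
    restored = fleet.DistributedStrategy()
    restored.load_from_prototxt("dist_strategy.prototxt")
    assert restored.recompute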
**Notes**: - **Detailed arguments for a_sync_configs** - **k_step**: number of local optimization updates before communication - **max_merge_var_num**: maximum number of merged gradients before communication - **send_queue_size**: a buffer size of worker communication - **independent_recv_thread**: if we are using independent recv thread for communication - **thread_pool_size**: number of thread pool - **send_wait_times**: waiting time for sending gradients - **runtime_split_send_recv**: if we are using Tensor split for send and recv during runtime + k_step(int): number of local optimization updates before communication + + max_merge_var_num(int): maximum number of merged gradients before communication + + send_queue_size(int): a buffer size of worker communication + + independent_recv_thread(bool): if we are using independent recv thread for communication + + thread_pool_size(int): number of thread pool + + send_wait_times(int): waiting time for sending gradients + + runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime Examples: .. code-block:: python @@ -270,11 +277,12 @@ class DistributedStrategy(object): strategy = fleet.DistributedStrategy() strategy.a_sync = True # by default this is True - configs = {"k_step": 10000, "send_queue_size": 32} + configs = {"k_steps": 1024, "send_queue_size": 32} strategy.a_sync_configs = configs # code block for defining loss and local optimizer # sgd = fleet.distributed_optimizer(optimizer, strategy) + """ return get_msg_dict(self.strategy.a_sync_configs) @@ -314,14 +322,21 @@ class DistributedStrategy(object): settings that can be configured through a dict. **Notes**: - **init_loss_scaling(float)**: The initial loss scaling factor. Default 32768. - **use_dynamic_loss_scaling(bool)**: Whether to use dynamic loss scaling. Default True. - **incr_every_n_steps(int)**: Increases loss scaling every n consecutive steps with finite gradients. Default 1000. - **decr_every_n_nan_or_inf(int)**: Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2. - **incr_ratio(float)**: The multiplier to use when increasing the loss scaling. Default 2.0. - **decr_ratio(float)**: The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5. - **custom_white_list(list[str])**: Users' custom white list which always execution fp16. - **custom_black_list(list[str])**: Users' custom black list which forbidden execution fp16. + init_loss_scaling(float): The initial loss scaling factor. Default 32768. + + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. Default True. + + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. Default 1000. + + decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2. + + incr_ratio(float): The multiplier to use when increasing the loss scaling. Default 2.0. + + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5. + + custom_white_list(list[str]): Users' custom white list which always execution fp16. + + custom_black_list(list[str]): Users' custom black list which forbidden execution fp16. Examples: .. 
code-block:: python @@ -553,7 +568,7 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.recompute = True - strategy.recompute_configs = {"checkpionts": ["x", "y"]} + strategy.recompute_configs = {"checkpoints": ["x", "y"]} """ return get_msg_dict(self.strategy.recompute_configs) @@ -603,6 +618,7 @@ class DistributedStrategy(object): **Notes**: **Detailed arguments for pipeline_configs** + **micro_batch**: the number of small batches in each user defined batch Examples: @@ -626,10 +642,10 @@ class DistributedStrategy(object): @property def localsgd(self): """ - Indicating whether we are using Local SGD training. For more details, please refer to - [Don't Use Large Mini-Batches, Use Local SGD](https://arxiv.org/pdf/1808.07217.pdf), + Indicating whether we are using Local SGD training. Default Value: False + For more details, please refer to + `Don't Use Large Mini-Batches, Use Local SGD `_. - Default Value: False Examples: .. code-block:: python @@ -655,13 +671,12 @@ class DistributedStrategy(object): setting that can be configured through a dict. **Notes**: - **k_steps(int)**: The local steps for training before parameter - synchronization. Default 1. If strategy.auto is set True, the - local steps will be calculated automatically during training. - The algorithm is referenced in this paper: - [Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD](https://arxiv.org/pdf/1810.08313.pdf). - In this case, k_steps indicates the first local steps which - is suggested setting to 1. + k_steps(int) The local steps for training before parameter synchronization. Default 1. + + If strategy.auto is set True, the local steps will be calculated automatically during training. + The algorithm is referenced in this paper: + `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. + In this case, k_steps indicates the first local steps which is suggested setting to 1. Examples: .. code-block:: python @@ -712,14 +727,16 @@ class DistributedStrategy(object): settings that can be configured through a dict. **Notes**: - **rampup_begin_step(int)**: The beginning step from which gradient compression is implemented. Default 0. - **rampup_step(int)**: Time steps used in sparsity warm-up periods. Default is 1. - For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, - it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array - ends, it will use 0.999 then and after. - **sparsity(list[float])**: Get top important element from gradient tensor, the ratio is (1 - sparsity). - Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important - element will be transmitted. + rampup_begin_step(int): The beginning step from which gradient compression is implemented. Default 0. + + rampup_step(int): Time steps used in sparsity warm-up periods. Default is 1. \ + For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \ + it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array \ + ends, it will use 0.999 then and after. + + sparsity(list[float]): Get top important element from gradient tensor, the ratio is (1 - sparsity). \ + Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important \ + element will be transmitted. 
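A short sketch that gathers the `dgc_configs` keys documented above into one assignment (the numbers mirror the rampup example in the notes and are illustrative, not defaults):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.dgc = True
    # warm the sparsity up over the first 100 steps, then keep only the
    # top 0.1% of gradient elements (sparsity 0.999) afterwards
    strategy.dgc_configs = {
        "rampup_begin_step": 0,
        "rampup_step": 100,
        "sparsity": [0.75, 0.9375, 0.984375, 0.996, 0.999],
    }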
Examples: .. code-block:: python @@ -749,7 +766,8 @@ class DistributedStrategy(object): to model parameters. Examples: - .. code-block:: python + .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.gradient_merge = True @@ -768,11 +786,15 @@ class DistributedStrategy(object): def gradient_merge_configs(self): """ the key-value configs of distribute_strategy - Keys: - k_steps (int): the update period of the parameters - avg (bool): whether to average the gradients of each mini-batch, - the default value is `True` - Example: + + **Note**: + k_steps(int): the update period of the parameters. + + avg(bool): whether to average the gradients of each mini-batch, the default value is `True` + + Examples: + .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.gradient_merge = True @@ -826,6 +848,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.lars = True @@ -882,6 +905,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.lamb = True diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 282ac29d6f9dafb4eb3b83471157464620326348..8c748060e630079af362759b1e4c1c0b09d58063 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -13,7 +13,10 @@ # limitations under the License. from __future__ import print_function +import warnings import paddle +from paddle.fluid.framework import dygraph_only +from paddle.fluid import compiler from .role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase from .strategy_compiler import StrategyCompiler from .distributed_strategy import DistributedStrategy @@ -21,6 +24,7 @@ from .meta_optimizer_factory import MetaOptimizerFactory from .runtime_factory import RuntimeFactory from .util_factory import UtilFactory from paddle.fluid.wrapped_decorator import wrap_decorator +from paddle.fluid.dygraph import parallel_helper def _inited_runtime_handler_(func): @@ -35,7 +39,24 @@ def _inited_runtime_handler_(func): return __impl__ +def _is_non_distributed_check_(func): + def __impl__(*args, **kwargs): + cls = args[0] + + if cls._role_maker is not None and cls._role_maker._is_non_distributed( + ) is True: + warnings.warn( + "%s() function doesn't work when use non_distributed fleet." % + (func.__name__)) + return + + return func(*args, **kwargs) + + return __impl__ + + inited_runtime_handler = wrap_decorator(_inited_runtime_handler_) +is_non_distributed_check = wrap_decorator(_is_non_distributed_check_) class Fleet(object): @@ -159,6 +180,12 @@ class Fleet(object): "`role_maker` should be subclass of `RoleMakerBase`, but got {}". 
format(type(role_maker))) self.strategy_compiler = StrategyCompiler() + if paddle.fluid.framework.in_dygraph_mode(): + if parallel_helper._is_parallel_ctx_initialized(): + warnings.warn( + "The dygraph parallel environment has been initialized.") + else: + paddle.distributed.init_parallel_env() return None def is_first_worker(self): @@ -367,6 +394,7 @@ class Fleet(object): """ self._role_maker.barrier_worker() + @is_non_distributed_check @inited_runtime_handler def init_worker(self): """ @@ -391,6 +419,7 @@ class Fleet(object): """ self._runtime_handle._init_worker() + @is_non_distributed_check @inited_runtime_handler def init_server(self, *args, **kwargs): """ @@ -416,6 +445,7 @@ class Fleet(object): """ self._runtime_handle._init_server(*args, **kwargs) + @is_non_distributed_check @inited_runtime_handler def run_server(self): """ @@ -440,6 +470,7 @@ class Fleet(object): """ self._runtime_handle._run_server() + @is_non_distributed_check @inited_runtime_handler def stop_worker(self): """ @@ -564,12 +595,344 @@ class Fleet(object): """ self.user_defined_optimizer = optimizer + if paddle.fluid.framework.in_dygraph_mode(): + return self + if strategy == None: strategy = DistributedStrategy() self.user_defined_strategy = strategy self.valid_strategy = None return self + @dygraph_only + def distributed_model(self, model): + """ + Return dygraph distributed data parallel model (Layer) + Only work in dygraph mode + + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + paddle.distributed.spawn(train) + """ + assert model is not None + self.model = paddle.DataParallel(model) + return self.model + + @dygraph_only + def state_dict(self): + """ + Get state dict information from optimizer. + Only work in dygraph mode + + Returns: + state_dict(dict) : dict contains all the Tensor used by optimizer + + Examples: + .. 
code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.state_dict() + + @dygraph_only + def set_state_dict(self, state_dict): + """ + Load optimizer state dict. + Only work in dygraph mode + + Args: + state_dict(dict) : Dict contains all the Tensor needed by optimizer + + Returns: None + + Examples: + .. code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() + paddle.framework.save(state_dict, "paddle_dy") + para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") + adam.set_state_dict(opti_state_dict) + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.set_state_dict(state_dict) + + @dygraph_only + def set_lr(self, value): + """ + Set the value of the learning rate manually in the optimizer. + Only work in dygraph mode + + Args: + value (float|Tensor): the value of learning rate + + Returns: None + + Examples: + .. code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.get_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.set_lr(value) + + @dygraph_only + def get_lr(self): + """ + Get current step learning rate. + Only work in dygraph mode + + Returns: + float: The learning rate of the current step. + + Examples: + .. code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + lr = adam.get_lr() + print(lr) # 0.01 + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.get_lr() + + @dygraph_only + def step(self): + """ + Execute the optimizer once. 
+ Only work in dygraph mode + + Returns: None + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + paddle.distributed.spawn(train) + + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.step() + + @dygraph_only + def clear_grad(self): + """ + Execute the optimizer once. + Only work in dygraph mode + + Returns: None + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + paddle.distributed.spawn(train) + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.clear_grad() + def minimize(self, loss, startup_program=None, @@ -593,8 +956,8 @@ class Fleet(object): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) variable pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. 
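The dygraph methods added above (`state_dict`, `set_state_dict`, `set_lr`, `get_lr`, `step`, `clear_grad`) all follow the same "imitate target optimizer retrieval" pattern: fleet keeps the user-defined optimizer and simply forwards the call. A pure-Python sketch of that delegation, using illustrative class names rather than Paddle's actual classes:

.. code-block:: python

    class _FleetLikeWrapper(object):
        def __init__(self, user_defined_optimizer):
            # fleet.distributed_optimizer() stores the user optimizer like this
            self.user_defined_optimizer = user_defined_optimizer

        def step(self):
            # imitate target optimizer retrieval
            return self.user_defined_optimizer.step()

        def clear_grad(self):
            return self.user_defined_optimizer.clear_grad()

    class _ToyOptimizer(object):
        def step(self):
            print("toy step")

        def clear_grad(self):
            print("toy clear_grad")

    wrapped = _FleetLikeWrapper(_ToyOptimizer())
    wrapped.step()        # prints "toy step"
    wrapped.clear_grad()  # prints "toy clear_grad"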
Examples: @@ -619,6 +982,11 @@ class Fleet(object): # for more examples, please reference https://github.com/PaddlePaddle/FleetX """ + if paddle.fluid.framework.in_dygraph_mode(): + # imitate target optimizer retrieval + target_opt = self.user_defined_optimizer + return target_opt.minimize(loss) + context = {} # cache original feed forward program self.origin_main_program = loss.block.program @@ -672,6 +1040,20 @@ class Fleet(object): optimize_ops = [] params_grads = [] + if self._role_maker._is_non_distributed() and not self._is_collective: + if self._runtime_handle is None: + self._runtime_handle = RuntimeFactory()._create_runtime(context) + + compiled_program = compiler.CompiledProgram( + self.origin_main_program).with_data_parallel( + loss_name=loss.name, share_vars_from=None) + loss.block.program._graph = compiled_program + return self.user_defined_optimizer.minimize( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) + if meta_optimizer: optimize_ops, params_grads = meta_optimizer.minimize( loss, diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 25f2d0dd3f45855d9f337c6b7154db9cb5bbae45..8614b1861343b8e48b55a8e75d9e432ef6329184 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -232,6 +232,8 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._node_type_comm = None self._all_comm = None + self._non_distributed = False + if not self._is_collective: self._hdfs_name = kwargs.get("hdfs_name", "") self._hdfs_ugi = kwargs.get("hdfs_ugi", "") @@ -373,6 +375,15 @@ class PaddleCloudRoleMaker(RoleMakerBase): self.generate_role() return self._server_endpoints + def _is_non_distributed(self): + """ + Return True if indispensable environment for fleetrun is not found + (use python-run to launch fleet-code directly) + """ + if not self._role_is_generated: + self.generate_role() + return self._non_distributed + def _heter_worker_num(self): """ get heter worker nums @@ -409,13 +420,22 @@ class PaddleCloudRoleMaker(RoleMakerBase): try: # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002 - self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST", - "").split(",") - assert self._server_endpoints != "" + self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") - assert self._server_endpoints != "" - + if self._server_endpoints is None: + # back to non_distributed execution. + self._server_endpoints = "" + self._trainers_num = 1 + self._role = Role.WORKER + self._current_id = 0 + self._node_num = 1 + self._heter_trainers_num = 0 + self._heter_trainer_endpoints = None + self._non_distributed = True + return + + self._server_endpoints = self._server_endpoints.split(",") trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) training_role = os.environ["TRAINING_ROLE"] @@ -488,7 +508,11 @@ class PaddleCloudRoleMaker(RoleMakerBase): assert (self._training_role == "TRAINER") self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") - assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS" + if self._worker_endpoints is None: + # back to non_distributed execution. 
+ self._worker_endpoints = "127.0.0.1:6170" + self._cur_endpoint = self._worker_endpoints + self._non_distributed = True self._worker_endpoints = self._worker_endpoints.split(",") self._trainers_num = len(self._worker_endpoints) self._node_num = len( diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 29a1bda92f17443e6c38b070379481aaa419b1d4..7778acaf83b310cfa9a04059ce6d3be2d5326089 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -200,11 +200,11 @@ def launch_collective(args): start_port = os.environ.get('FLAGS_START_PORT') if cloud_utils.use_paddlecloud() and trainers_num != 1: cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port) - logger.info("get cluster from cloud:{}".format(cluster)) + logger.debug("get cluster from cloud:{}".format(cluster)) else: # trainers_num = 1 or not use paddlecloud ips="a,b" cluster, pod = get_cluster_from_args(args, gpus) - logger.info("get cluster from args:{}".format(cluster)) + logger.debug("get cluster from args:{}".format(cluster)) procs = start_local_trainers( cluster, @@ -217,7 +217,8 @@ def launch_collective(args): alive = watch_local_trainers(procs, cluster.trainers_nranks()) if not alive: - logger.info("Local procs complete, POD info:{}".format(pod)) + logger.info("Local processes completed.") + logger.debug("POD info:{}".format(pod)) break time.sleep(3) @@ -313,18 +314,26 @@ def launch_ps(args): cmds = [] log_fns = [] for idx, cur_server in enumerate(pod.servers): - current_env.update({ + proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, "PADDLE_PORT": cur_server.endpoint.split(":")[1], "TRAINING_ROLE": "PSERVER", "PADDLE_TRAINERS_NUM": str(worker_num), "POD_IP": cur_server.endpoint.split(":")[0] - }) + } + current_env.update(proc_env) cmd = [sys.executable, "-u", args.training_script ] + args.training_script_args cmds.append(cmd) + if idx == 0: + logger.info( + "Local server start {} processes. First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.servers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) + if args.log_dir is not None: os.system("mkdir -p {}".format(args.log_dir)) fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w") @@ -338,21 +347,32 @@ def launch_ps(args): tp.rank = cur_server.rank tp.local_rank = idx tp.log_fn = fn - tp.log_offset = 0 if fn else None + tp.log_offset = fn.tell() if fn else None tp.cmd = cmd procs.append(tp) for idx, cur_worker in enumerate(pod.workers): - current_env.update({ + proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, "PADDLE_TRAINERS_NUM": str(worker_num), "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(cur_worker.rank) - }) + } + current_env.update(proc_env) + cmd = [sys.executable, "-u", args.training_script ] + args.training_script_args cmds.append(cmd) + + if idx == 0: + logger.info( + "Local worker start {} processes. 
First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.workers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) + if args.log_dir is not None: os.system("mkdir -p {}".format(args.log_dir)) fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w") @@ -366,11 +386,14 @@ def launch_ps(args): tp.rank = cur_worker.rank tp.local_rank = idx tp.log_fn = fn - tp.log_offset = 0 if fn else None + tp.log_offset = fn.tell() if fn else None tp.cmd = cmd procs.append(tp) + logger.info( + "Please check servers and workers logs in {}/workerlog.* and {}/serverlog.*". + format(args.log_dir, args.log_dir)) # only wait worker to finish here for i, proc in enumerate(procs): if i < len(pod.servers): @@ -403,16 +426,16 @@ def launch(): cuda_device_num = fluid.core.get_cuda_device_count() if len(has_ps_args) > 0 or cuda_device_num == 0: logger.info( - "Run parameter-sever cpu mode. pserver args:{}, cuda count:{}". + "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}". format(has_ps_args, cuda_device_num)) launch_ps(args) elif len(has_collective_args) > 0: - logger.info("Run collective gpu mode. gpu args:{}, cuda count:{}". + logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}". format(has_collective_args, cuda_device_num)) launch_collective(args) else: logger.warning( - "Not found distinct args. Default use gpu collective mode") + "Not found distinct arguments. Default use gpu collective mode") launch_collective(args) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 350d8ae2b44db3e8f8e6b00d95c2b7a9ca91f88b..3da5aed8201ace6ccf9eed1ff322a7c6304de4a6 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -253,7 +253,8 @@ def terminate_local_procs(procs): for p in procs: if p.proc.poll() is None: p.proc.terminate() - p.log_fn.close() + if p.log_fn: + p.log_fn.close() logger.debug("terminate process id:{}".format(p.proc.pid)) #wait all process terminiated @@ -338,6 +339,45 @@ def get_ports(num, offset): return ports +def pretty_print_envs(envs, header=None): + spacing = 2 + max_k = 40 + max_v = 45 + + for k, v in envs.items(): + max_k = max(max_k, len(k)) + + h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v) + l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v) + length = max_k + max_v + spacing + + border = "".join(["="] * length) + line = "".join(["-"] * length) + + draws = "" + draws += border + "\n" + + if header: + draws += h_format.format(header[0], header[1]) + else: + draws += h_format.format("fleetrun Distributed Envs", "Value") + + draws += line + "\n" + + for k, v in envs.items(): + if isinstance(v, str) and len(v) >= max_v: + str_v = "... " + v[-41:] + else: + str_v = v + + draws += l_format.format(k, " " * spacing, str(str_v)) + + draws += border + + _str = "\n{}\n".format(draws) + return _str + + class TrainerProc(object): def __init__(self): self.proc = None @@ -373,11 +413,19 @@ def start_local_trainers(cluster, current_env.update(proc_env) - logger.debug("trainer proc env:{}".format(current_env)) - cmd = [sys.executable, "-u", training_script] + training_script_args - logger.info("start trainer proc:{} env:{}".format(cmd, proc_env)) + logger.debug("start trainer proc{} env:{}".format(cmd, current_env)) + + if idx == 0: + logger.info("Local start {} processes. 
First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.trainers), + pretty_print_envs(proc_env, ("Distributed Envs", + "Value")))) + logger.info( + "More details for debug about commands and environments are written in {}/run.sh". + format(log_dir)) fn = None if log_dir is not None: diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 1741f10ccb1c28bfe6abaa63e754568fa08e21ce..227f8f60210ee8a44ab9e87ed7b88337c79ac7f1 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -154,15 +154,16 @@ class ParameterServerRuntime(RuntimeBase): kwargs["sparse_attrs"] = get_sparse_attrs() return kwargs - from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops + from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \ SyncStrategy, GeoStrategy trainer_config = self.async_strategy.get_trainer_runtime_config() - lrs = _get_lr_ops(self.origin_main_program) - if len(lrs) > 0: + lrs = _has_global_step(_get_lr_ops(self.origin_main_program)) + + if lrs: kwargs = {"need_global_step": "1"} else: kwargs = {"need_global_step": "0"} @@ -201,6 +202,9 @@ class ParameterServerRuntime(RuntimeBase): if self.role_maker._get_heter_worker_device() == "GPU": gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) executor = Executor(fluid.CUDAPlace(gpu_id)) + elif self.role_maker._get_heter_worker_device() == "XPU": + xpu_id = int(os.getenv("FLAGS_selected_xpus", "0")) + executor = Executor(fluid.XPUPlace(xpu_id)) else: raise ValueError("Not Support Device {}".format( self.role_maker._get_heter_worker_device())) diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index 49e98805d24f3f8f5dc1cfcbf3ddc8d9fb835fde..918ebce07825139fabe8ddd4c1e266dd04eb7f6d 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -102,21 +102,24 @@ class Distribution(object): tmp = 0. for arg in args: - valid_arg = False - for cls in [float, list, np.ndarray, tensor.Variable]: - if isinstance(arg, cls): - valid_arg = True - break - assert valid_arg, "type of input args must be float, list, numpy.ndarray or Tensor." if isinstance(arg, float): - arg = np.zeros(1) + arg + arg = [arg] + if not isinstance(arg, (list, np.ndarray, tensor.Variable)): + raise TypeError( + "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}". + format(type(arg))) + arg_np = np.array(arg) arg_dtype = arg_np.dtype - if str(arg_dtype) not in ['float32']: - warnings.warn( - "data type of argument only support float32, your argument will be convert to float32." - ) + if str(arg_dtype) != 'float32': + if str(arg_dtype) != 'float64': + # "assign" op doesn't support float64. if dtype is float64, float32 variable will be generated + # and converted to float64 later using "cast". + warnings.warn( + "data type of argument only support float32 and float64, your argument will be convert to float32." + ) arg_np = arg_np.astype('float32') + # tmp is used to support broadcast, it summarizes shapes of all the args and get the mixed shape. 
tmp = tmp + arg_np numpy_args.append(arg_np) @@ -129,6 +132,36 @@ class Distribution(object): return tuple(variable_args) + def _check_values_dtype_in_probs(self, param, value): + """ + Log_prob and probs methods have input ``value``, if value's dtype is different from param, + convert value's dtype to be consistent with param's dtype. + + Args: + param (int|float|list|numpy.ndarray|Tensor): low and high in Uniform class, loc and scale in Normal class. + value (Tensor): The input tensor. + + Returns: + value (Tensor): Change value's dtype if value's dtype is different from param. + """ + if in_dygraph_mode(): + if value.dtype != param.dtype and convert_dtype( + value.dtype) in ['float32', 'float64']: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." + ) + return core.ops.cast(value, 'in_dtype', value.dtype, + 'out_dtype', param.dtype) + + check_variable_and_dtype(value, 'value', ['float32', 'float64'], + 'log_prob') + if value.dtype != param.dtype: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." + ) + return tensor.cast(value, dtype=param.dtype) + return value + class Uniform(Distribution): """Uniform distribution with `low` and `high` parameters. @@ -155,8 +188,8 @@ class Uniform(Distribution): [broadcasting](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/beginners_guide/basic_concept/broadcasting_en.html) (e.g., `high - low` is a valid operation). Args: - low(int|float|list|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor - high(int|float|list|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor + low(int|float|list|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float, list, numpy.ndarray or Tensor + high(int|float|list|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float, list, numpy.ndarray or Tensor name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Examples: @@ -206,6 +239,7 @@ class Uniform(Distribution): self.all_arg_is_float = False self.batch_size_unknown = False self.name = name if name is not None else 'Uniform' + self.dtype = 'float32' if isinstance(low, int): low = float(low) @@ -216,10 +250,22 @@ class Uniform(Distribution): self.batch_size_unknown = True self.low = low self.high = high + self.dtype = convert_dtype(low.dtype) else: if isinstance(low, float) and isinstance(high, float): self.all_arg_is_float = True + if isinstance( + low, + np.ndarray) and str(low.dtype) in ['float32', 'float64']: + self.dtype = low.dtype + elif isinstance( + high, + np.ndarray) and str(high.dtype) in ['float32', 'float64']: + self.dtype = high.dtype self.low, self.high = self._to_tensor(low, high) + if self.dtype != convert_dtype(self.low.dtype): + self.low = tensor.cast(self.low, dtype=self.dtype) + self.high = tensor.cast(self.high, dtype=self.dtype) def sample(self, shape, seed=0): """Generate samples of the specified shape. @@ -241,11 +287,11 @@ class Uniform(Distribution): if self.batch_size_unknown: output_shape = shape + batch_shape zero_tmp = tensor.fill_constant_batch_size_like( - self.low + self.high, batch_shape + shape, self.low.dtype, 0.) 
+ self.low + self.high, batch_shape + shape, self.dtype, 0.) uniform_random_tmp = nn.uniform_random_batch_size_like( zero_tmp, zero_tmp.shape, - dtype=convert_dtype(zero_tmp.dtype), + dtype=self.dtype, min=0., max=1., seed=seed) @@ -259,9 +305,8 @@ class Uniform(Distribution): else: output_shape = shape + batch_shape output = nn.uniform_random( - output_shape, seed=seed) * (tensor.zeros( - output_shape, dtype=self.low.dtype) + - (self.high - self.low)) + output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros( + output_shape, dtype=self.dtype) + (self.high - self.low)) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: return nn.reshape(output, shape, name=name) @@ -279,22 +324,20 @@ class Uniform(Distribution): """ name = self.name + '_log_prob' + value = self._check_values_dtype_in_probs(self.low, value) if in_dygraph_mode(): + # ensure value in [low, high] lb_bool = self.low < value ub_bool = value < self.high - dtype = value.dtype lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - dtype) + value.dtype) ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - dtype) + value.dtype) return nn.log(lb * ub) - nn.log(self.high - self.low) - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - - lb_bool = control_flow.less_than(self.low, value) - ub_bool = control_flow.less_than(value, self.high) + lb_bool = self.low < value + ub_bool = value < self.high lb = tensor.cast(lb_bool, dtype=value.dtype) ub = tensor.cast(ub_bool, dtype=value.dtype) return elementwise_sub( @@ -311,22 +354,19 @@ class Uniform(Distribution): """ name = self.name + '_probs' + value = self._check_values_dtype_in_probs(self.low, value) if in_dygraph_mode(): lb_bool = self.low < value ub_bool = value < self.high - dtype = value.dtype lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - dtype) + value.dtype) ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - dtype) + value.dtype) return (lb * ub) / (self.high - self.low) - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - - lb_bool = control_flow.less_than(self.low, value) - ub_bool = control_flow.less_than(value, self.high) + lb_bool = self.low < value + ub_bool = value < self.high lb = tensor.cast(lb_bool, dtype=value.dtype) ub = tensor.cast(ub_bool, dtype=value.dtype) return elementwise_div((lb * ub), (self.high - self.low), name=name) @@ -334,6 +374,12 @@ class Uniform(Distribution): def entropy(self): """Shannon entropy in nats. + The entropy is + + .. math:: + + entropy(low, high) = \\log (high - low) + Returns: Tensor: Shannon entropy of uniform distribution.The data type is float32. @@ -364,8 +410,8 @@ class Normal(Distribution): * :math:`Z`: is the normalization constant. Args: - loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor. - scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor. + loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. + scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
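A sketch of the new dtype handling in this file: float64 numpy parameters now keep float64 instead of always falling back to float32, and `log_prob`/`probs` cast `value` to the parameter dtype via `_check_values_dtype_in_probs`. Assumes a Paddle build with the dygraph API shown in the surrounding docstrings:

.. code-block:: python

    import numpy as np
    import paddle
    from paddle.distribution import Uniform

    paddle.disable_static()

    low = np.array([1.0, 2.0], dtype='float64')
    high = np.array([3.0, 4.0], dtype='float64')
    uniform = Uniform(low, high)

    samples = uniform.sample([5])                     # float64 samples
    value = paddle.to_tensor([2.5, 3.5], dtype='float32')
    lp = uniform.log_prob(value)                      # value is cast to float64 first, with a warning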
Examples: @@ -418,6 +464,7 @@ class Normal(Distribution): self.batch_size_unknown = False self.all_arg_is_float = False self.name = name if name is not None else 'Normal' + self.dtype = 'float32' if isinstance(loc, int): loc = float(loc) @@ -428,10 +475,22 @@ class Normal(Distribution): self.batch_size_unknown = True self.loc = loc self.scale = scale + self.dtype = convert_dtype(loc.dtype) else: if isinstance(loc, float) and isinstance(scale, float): self.all_arg_is_float = True + if isinstance( + loc, + np.ndarray) and str(loc.dtype) in ['float32', 'float64']: + self.dtype = loc.dtype + elif isinstance( + scale, + np.ndarray) and str(scale.dtype) in ['float32', 'float64']: + self.dtype = scale.dtype self.loc, self.scale = self._to_tensor(loc, scale) + if self.dtype != convert_dtype(self.loc.dtype): + self.loc = tensor.cast(self.loc, dtype=self.dtype) + self.scale = tensor.cast(self.scale, dtype=self.dtype) def sample(self, shape, seed=0): """Generate samples of the specified shape. @@ -454,22 +513,18 @@ class Normal(Distribution): if self.batch_size_unknown: output_shape = shape + batch_shape zero_tmp = tensor.fill_constant_batch_size_like( - self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.) + self.loc + self.scale, batch_shape + shape, self.dtype, 0.) zero_tmp_reshape = nn.reshape(zero_tmp, output_shape) zero_tmp_shape = nn.shape(zero_tmp_reshape) normal_random_tmp = nn.gaussian_random( - zero_tmp_shape, - mean=0., - std=1., - seed=seed, - dtype=convert_dtype(self.loc.dtype)) + zero_tmp_shape, mean=0., std=1., seed=seed, dtype=self.dtype) output = normal_random_tmp * (zero_tmp_reshape + self.scale) output = elementwise_add(output, self.loc, name=name) return output else: output_shape = shape + batch_shape - output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed) * \ - (tensor.zeros(output_shape, dtype=self.loc.dtype) + self.scale) + output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed, dtype=self.dtype) * \ + (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) output = elementwise_add(output, self.loc, name=name) if self.all_arg_is_float: return nn.reshape(output, shape, name=name) @@ -479,6 +534,16 @@ class Normal(Distribution): def entropy(self): """Shannon entropy in nats. + The entropy is + + .. math:: + + entropy(\sigma) = 0.5 \\log (2 \pi e \sigma^2) + + In the above equation: + + * :math:`scale = \sigma`: is the std. + Returns: Tensor: Shannon entropy of normal distribution.The data type is float32. @@ -486,7 +551,7 @@ class Normal(Distribution): name = self.name + '_entropy' batch_shape = list((self.loc + self.scale).shape) zero_tmp = tensor.fill_constant_batch_size_like( - self.loc + self.scale, batch_shape, self.loc.dtype, 0.) + self.loc + self.scale, batch_shape, self.dtype, 0.) return elementwise_add( 0.5 + zero_tmp, 0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)), @@ -502,11 +567,9 @@ class Normal(Distribution): Tensor: log probability.The data type is same with value. """ - if not in_dygraph_mode(): - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - name = self.name + '_log_prob' + value = self._check_values_dtype_in_probs(self.loc, value) + var = self.scale * self.scale log_scale = nn.log(self.scale) return elementwise_sub( @@ -524,11 +587,9 @@ class Normal(Distribution): Tensor: probability.The data type is same with value. 
""" - if not in_dygraph_mode(): - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - name = self.name + '_probs' + value = self._check_values_dtype_in_probs(self.loc, value) + var = self.scale * self.scale return elementwise_div( ops.exp(-1. * ((value - self.loc) * (value - self.loc)) / @@ -538,6 +599,29 @@ class Normal(Distribution): def kl_divergence(self, other): """The KL-divergence between two normal distributions. + The probability density function (pdf) is + + .. math:: + + KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\\frac{diff}{\sigma_1})^2 - 1 - 2 \\ln {ratio}) + + .. math:: + + ratio = \\frac{\sigma_0}{\sigma_1} + + .. math:: + + diff = \mu_1 - \mu_0 + + In the above equation: + + * :math:`loc = \mu_0`: is the mean of current Normal distribution. + * :math:`scale = \sigma_0`: is the std of current Normal distribution. + * :math:`loc = \mu_1`: is the mean of other Normal distribution. + * :math:`scale = \sigma_1`: is the std of other Normal distribution. + * :math:`ratio`: is the ratio of scales. + * :math:`diff`: is the difference between means. + Args: other (Normal): instance of Normal. diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 5f6594a47213021c3a82dd4a0266f52240270e87..7b301ac19d1d3dc1f4aabb6cf3af2f0874faa677 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -129,7 +129,7 @@ class GradientClipBase(object): def __str__(self): raise NotImplementedError() - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): raise NotImplementedError @@ -258,7 +258,7 @@ class GradientClipByValue(GradientClipBase): def __str__(self): return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] for p, g in params_grads: @@ -413,7 +413,7 @@ class GradientClipByNorm(GradientClipBase): def __str__(self): return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] for p, g in params_grads: @@ -565,7 +565,7 @@ class GradientClipByGlobalNorm(GradientClipBase): def __str__(self): return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] sum_square_list = [] diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 93013ef8bf8442311621202e0a86dd65e7c38b30..328dafe6219adb3c6355de0bafc430c52725024f 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -74,7 +74,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): continue for in_var_name in op.input(in_name): in_var = block.var(in_var_name) - if in_var.type not in valid_types: + if in_var.type not in valid_types or in_var.dtype == dest_dtype: continue if in_var.dtype == src_dtype: cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype) @@ -84,7 +84,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): name=cast_name, dtype=dest_dtype, persistable=False, - stop_gradient=False) + stop_gradient=in_var.stop_gradient) block._insert_op( idx, @@ -100,7 +100,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): else: if 
op.has_attr('in_dtype'): op._set_attr('in_dtype', dest_dtype) - if src_dtype == core.VarDesc.VarType.FP32: + if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: if op.type == 'batch_norm' and out_name != 'Y': continue diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 007d701284dfc7ff2cafb128984414517579fce3..6ac005060e0b21d88f17619bbe88b7a56c23fdb8 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -270,7 +270,7 @@ foreach(src ${TEST_OPS}) endforeach() # setting timeout value for old unittests -if(NOT WIN32) +if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY") - set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") + set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") endif() diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 45aa85d4168a55e206460ce2e39292013caa9ce0..5da83da33b8de334d4ae1e5b072cfb20d74c1271 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -132,6 +132,28 @@ def check_dtype(input_dtype, extra_message)) +def check_shape(shape, + op_name, + expected_shape_type=(list, tuple, Variable), + expected_element_type=(int, Variable), + expected_tensor_dtype=('int32', 'int64')): + # See NOTE [ Why skip dynamic graph check ] + if in_dygraph_mode(): + return + check_type(shape, 'shape', expected_shape_type, op_name) + if expected_element_type is not None and not isinstance(shape, Variable): + for item in shape: + check_type(item, 'element of shape', expected_element_type, op_name) + if expected_tensor_dtype is not None and isinstance(item, Variable): + check_dtype( + item.dtype, 'element of shape', expected_tensor_dtype, + op_name, + 'If element of shape is Tensor, its data type should be {}'. + format(', '.join(expected_tensor_dtype))) + if expected_tensor_dtype is not None and isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', expected_tensor_dtype, op_name) + + class DataToLoDTensorConverter(object): def __init__(self, place, lod_level, shape, dtype): self.place = place diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 2174dbd31b8fb1ae97894699e03e25e809085cc8..c548bdfeba19510b26c0f80d356fa6a6b7bbaed7 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
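The clip.py hunks above drop the parentheses from `@imperative_base.no_grad()`, which works because the reworked `no_grad` in the dygraph/base.py hunk below accepts either a function (bare decorator) or nothing (context manager). A minimal pure-Python sketch of that dual-use pattern, not Paddle's actual implementation:

.. code-block:: python

    import contextlib
    import functools

    _grad_enabled = [True]  # stand-in for the tracer's train mode

    @contextlib.contextmanager
    def _switch_grad_off():
        old = _grad_enabled[0]
        _grad_enabled[0] = False
        try:
            yield
        finally:
            _grad_enabled[0] = old

    def no_grad(func=None):
        if func is None:
            # used as `with no_grad():`
            return _switch_grad_off()

        # used as a bare `@no_grad` decorator
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with _switch_grad_off():
                return func(*args, **kwargs)

        return wrapper

    @no_grad
    def forward():
        return _grad_enabled[0]

    print(forward())              # False
    with no_grad():
        print(_grad_enabled[0])   # False
    print(_grad_enabled[0])       # True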
from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator -import inspect import decorator import contextlib +import functools +import inspect import sys import numpy as np from paddle.fluid import core @@ -26,8 +27,8 @@ import objgraph from ..data_feeder import convert_dtype __all__ = [ - 'no_grad', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', 'enabled', - 'to_variable' + 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', + 'enabled', 'to_variable' ] @@ -167,7 +168,80 @@ def disable_dygraph(): _functional_dygraph_context_manager = None -class no_grad: +@signature_safe_contextmanager +def _switch_tracer_mode_guard_(is_train=True): + tracer = framework._dygraph_tracer() + if tracer: + mode = tracer._train_mode + tracer._train_mode = is_train + try: + yield + finally: + tracer._train_mode = mode + else: + yield + + +def no_grad(func=None): + """ + :api_attr: imperative + + Create a context which disables dygraph gradient calculation. + In this mode, the result of every computation will have `stop_gradient=True`. + + Also functions as a decorator. (Make sure to instantiate without parenthesis.) + + Examples: + + .. code-block:: python + + import numpy as np + import paddle.fluid as fluid + + # use as generator + + data = np.array([[2, 3], [4, 5]]).astype('float32') + with fluid.dygraph.guard(): + l0 = fluid.Linear(2, 2) # l0.weight.gradient() is None + l1 = fluid.Linear(2, 2) + with fluid.dygraph.no_grad(): + # l1.weight.stop_gradient is False + tmp = l1.weight * 2 # tmp.stop_gradient is True + x = fluid.dygraph.to_variable(data) + y = l0(x) + tmp + o = l1(y) + o.backward() + print(tmp.gradient() is None) # True + print(l0.weight.gradient() is None) # False + + # use as decorator + + @fluid.dygraph.no_grad + def test_layer(): + with fluid.dygraph.guard(): + inp = np.ones([3, 1024], dtype='float32') + t = fluid.dygraph.base.to_variable(inp) + linear1 = fluid.Linear(1024, 4, bias_attr=False) + linear2 = fluid.Linear(4, 4) + ret = linear1(t) + dy_ret = linear2(ret) + + test_layer() + + """ + if func is None: + return _switch_tracer_mode_guard_(is_train=False) + else: + + @decorator.decorator + def __impl__(func, *args, **kwargs): + with _switch_tracer_mode_guard_(is_train=False): + return func(*args, **kwargs) + + return __impl__(func) + + +class no_grad_: """ :api_attr: imperative diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 82018132cc8b8600958e5cd52df5844e3d37638e..f85b184f68111bbc0930b36e2ba6e05c2dbd006a 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -16,13 +16,16 @@ from __future__ import print_function import os import collections +import functools from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer import pickle import six from . import learning_rate_scheduler import warnings from .. 
import core -from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars +from .base import guard +from paddle.fluid.dygraph.jit import SaveLoadConfig +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers __all__ = [ 'save_dygraph', @@ -30,6 +33,37 @@ __all__ = [ ] +# NOTE(chenweihang): deprecate load_dygraph's argument keep_name_table, +# ensure compatibility when user still use keep_name_table argument +def deprecate_keep_name_table(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + def __warn_and_build_configs__(keep_name_table): + warnings.warn( + "The argument `keep_name_table` has deprecated, please use `SaveLoadConfig.keep_name_table`.", + DeprecationWarning) + configs = SaveLoadConfig() + configs.keep_name_table = keep_name_table + return configs + + # deal with arg `keep_name_table` + if len(args) > 1 and isinstance(args[1], bool): + args = list(args) + args[1] = __warn_and_build_configs__(args[1]) + # deal with kwargs + elif 'keep_name_table' in kwargs: + kwargs['configs'] = __warn_and_build_configs__(kwargs[ + 'keep_name_table']) + kwargs.pop('keep_name_table') + else: + # do nothing + pass + + return func(*args, **kwargs) + + return wrapper + + @dygraph_only def save_dygraph(state_dict, model_path): ''' @@ -100,17 +134,27 @@ def save_dygraph(state_dict, model_path): # TODO(qingqing01): remove dygraph_only to support loading static model. # maybe need to unify the loading interface after 2.0 API is ready. -#@dygraph_only -def load_dygraph(model_path, keep_name_table=False): +# @dygraph_only +@deprecate_keep_name_table +def load_dygraph(model_path, configs=None): ''' :api_attr: imperative - Load parameter state_dict from disk. + Load parameter state dict from disk. + + .. note:: + Due to some historical reasons, if you load ``state_dict`` from the saved + result of `paddle.io.save_inference_model`, the structured variable name + will cannot be restored. You need to set the argument `use_structured_name=False` + when using `Layer.set_state_dict` later. Args: - model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams') - keep_name_table(bool, optional) : Whether keep structed name to parameter name conversion table in output dict. - Default : False + model_path(str) : The file prefix store the state_dict. + (The path should Not contain suffix '.pdparams') + configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` + object that specifies additional configuration options, these options + are for compatibility with ``jit.save/io.save_inference_model`` formats. + Default None. Returns: state_dict(dict) : the dict store the state_dict @@ -118,23 +162,27 @@ def load_dygraph(model_path, keep_name_table=False): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + paddle.disable_static() - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + emb = paddle.nn.Embedding([10, 10]) - adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000), - parameter_list = emb.parameters() ) - state_dict = adam.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + scheduler = paddle.optimizer.lr_scheduler.NoamLR( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=emb.parameters()) + state_dict = adam.state_dict() + paddle.save(state_dict, "paddle_dy") - ''' + para_state_dict, opti_state_dict = paddle.load("paddle_dy") + ''' + # deal with argument `model_path` model_prefix = model_path if model_prefix.endswith(".pdparams"): model_prefix = model_prefix[:-9] @@ -145,74 +193,53 @@ def load_dygraph(model_path, keep_name_table=False): opti_dict = None params_file_path = model_prefix + ".pdparams" opti_file_path = model_prefix + ".pdopt" + + # deal with argument `configs` + if configs is None: + configs = SaveLoadConfig() + if not os.path.exists(params_file_path) and not os.path.exists( opti_file_path): - # Load state dict by `jit.save` save format - # TODO(chenweihang): [Why not support `io.save_infernece_model` save format here] + # Load state dict by `jit.save/io.save_inference_model` save format + # NOTE(chenweihang): [ Compatibility of save_inference_model save format ] # The model saved by `save_inference_model` does not completely correspond to # the information required by the `state_dict` under the dygraph. - # Although we reluctantly restore the `state_dict` in some scenarios, - # this may not be complete and there are some limitations, so this function - # will be considered later. The limitations include: - # 1. `save_inference_model` not save structured name, we need to remind - # the user to configure the `use_structured_name` argument when `set_dict`, - # but this argument is currently not public - # 2. if `save_inference_model` save all persistable variables in a single file, - # user need to give the variable name list to load `state_dict` + # `save_inference_model` not save structured name, we need to remind + # the user to configure the `use_structured_name` argument when `set_state_dict` + # NOTE(chenweihang): `jit.save` doesn't save optimizer state # 1. check model path if not os.path.isdir(model_prefix): raise ValueError("Model saved directory '%s' is not exists." % model_prefix) - # 2. load `__variables.info__` - var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME) - if not os.path.exists(var_info_path): - raise RuntimeError( - "No target can be loaded. Now only supports loading `state_dict` from " - "the result saved by `imperative.save` and `imperative.jit.save`." - ) - with open(var_info_path, 'rb') as f: - extra_var_info = pickle.load(f) - # 3. 
load `__variables__` - # TODO(chenweihang): now only supports loading from default save format: - # - all persistable vars saved in one file named `__variables__` - # for other case, we may need to modify the arguments of this API - var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME) - if not os.path.exists(var_file_path): - raise RuntimeError( - "The parameter file to be loaded was not found. " - "Now only supports loading from the default save format, " - "and does not support custom params_filename and " - "save parameters separately.") - # 4. load all persistable vars - load_var_list = [] - for name in sorted(extra_var_info): - var = _varbase_creator(name=name, persistable=True) - load_var_list.append(var) - _dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) - # 5. construct state_dict - para_dict = dict() - for var in load_var_list: - structured_name = extra_var_info[var.name].get('structured_name', - None) - if structured_name is None: - raise RuntimeError( - "Cannot find saved variable (%s)'s structured name in saved model.", - var.name) - para_dict[structured_name] = var.numpy() - # NOTE: `jit.save` doesn't save optimizer state + + # 2. load program desc & construct _ProgramHolder + programs = _construct_program_holders(model_path, + configs.model_filename) + + # 3. load layer parameters & buffers + # NOTE: using fluid.dygraph.guard() here will cause import error in py2 + with guard(): + persistable_var_dict = _construct_params_and_buffers( + model_prefix, + programs, + configs.separate_params, + configs.params_filename, + append_suffix=False) + + # 4. construct state_dict + para_dict = dict() + for var_name in persistable_var_dict: + para_dict[var_name] = persistable_var_dict[var_name].numpy() else: # Load state dict by `save_dygraph` save format + para_dict = {} if os.path.exists(params_file_path): with open(params_file_path, 'rb') as f: para_dict = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') - if not keep_name_table and "StructuredToParameterName@@" in para_dict: + if not configs.keep_name_table and "StructuredToParameterName@@" in para_dict: del para_dict["StructuredToParameterName@@"] if os.path.exists(opti_file_path): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py index 82f39ffd080ec803beca4e60695204b707f48210..9334c15f7bcbc0ca3782be1d4f7fc6826a59bdbc 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py @@ -16,9 +16,7 @@ import astor import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper -from paddle.fluid.dygraph.dygraph_to_static.utils import is_dygraph_api, is_to_variable -from paddle.fluid.dygraph.dygraph_to_static.utils import to_assign_node, to_static_ast, update_args_of_func -from paddle.fluid.dygraph.dygraph_to_static.utils import dygraph_class_to_static_api +from paddle.fluid.dygraph.dygraph_to_static import utils class BasicApiTransformer(gast.NodeTransformer): @@ -56,7 +54,7 @@ class BasicApiTransformer(gast.NodeTransformer): if isinstance(child_node, gast.Call): # TODO(liym27): # Considers that a dygraph api which modifies the input or has a output. 
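(A minimal sketch of the compatibility path described in the `load_dygraph` note above: when the state dict comes from a `jit.save` / `save_inference_model` directory, structured names are not recorded, so parameters must be matched by name. The layer and the model directory below are hypothetical.)

.. code-block:: python

    import paddle

    paddle.disable_static()
    net = paddle.nn.Linear(784, 10)                  # hypothetical layer
    # "linear.infer.model" stands for a directory saved by paddle.jit.save
    para_state_dict, _ = paddle.load("linear.infer.model")
    # this format keeps no structured names, so match by parameter name
    net.set_state_dict(para_state_dict, use_structured_name=False)
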
- if is_dygraph_api(child_node): + if utils.is_dygraph_api(child_node): return else: self._visit_Call(child_node) @@ -73,7 +71,7 @@ class BasicApiTransformer(gast.NodeTransformer): if self._is_dygraph_forward(func_name): class_node = self._get_class_node(func_name) - static_node = to_static_ast(node, class_node) + static_node = utils.to_static_ast(node, class_node) return static_node else: return node @@ -91,14 +89,51 @@ class BasicApiTransformer(gast.NodeTransformer): if is_to_variable(node_value): return False - if is_dygraph_api(node_value): + if utils.is_dygraph_api(node_value): dygraph_api = node_value.func.attr - if not dygraph_class_to_static_api.get(dygraph_api): + if not utils.dygraph_class_to_static_api.get(dygraph_api): return False - update_args_of_func(node_value, node_value, "__init__") + utils.update_args_of_func(node_value, node_value, "__init__") target_str = astor.to_source(gast.gast_to_ast(node.targets[0])) self.class_node_dict[target_str] = node_value return True # TODO: node.value is not dygraph class return False + + +def is_to_variable(node): + assert isinstance(node, gast.Call) + api_name = utils.ast_to_source_code(node.func).strip() + + if utils.is_dygraph_api(node): + return api_name.endswith("to_variable") + + if utils.is_paddle_api(node): + return api_name.endswith("to_tensor") + + return False + + +def to_assign_node(node): + # Transform dygraph api `fluid.dygraph.to_variable` alias `paddle.to_tensor` to static api `fluid.layers.assign`. + # NOTE: + # 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16}, + # but api `assign` only supports {float32, float64, int32, int64, bool}; + # 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024. + + assert isinstance(node, gast.Call) + assign_api = gast.parse('fluid.layers.assign').body[0].value + node.func = assign_api + + if node.args: + node.args = [node.args[0]] + node.keywords = [] + else: + for idx, kw in enumerate(node.keywords): + if kw.arg == 'value' or kw.arg == 'data': + node.keywords[idx].arg = 'input' + node.keywords = [node.keywords[idx]] + node.args = [] + break + return node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py index 75cb65085846d672d2488c98bf6ad625ac12e78b..c52872b15016169504359b54ad5a40360e244ce0 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py @@ -98,8 +98,15 @@ class TranslatorLogger(object): return level == self.transformed_code_level def has_verbosity(self, level): + """ + Checks whether the verbosity level set by the user is greater than or equal to the log level. + Args: + level(int): The level of log. + Returns: + True if the verbosity level set by the user is greater than or equal to the log level, otherwise False. 
+ """ level = self.check_level(level) - return level >= self.verbosity_level + return self.verbosity_level >= level def error(self, msg, *args, **kwargs): self.logger.error(msg, *args, **kwargs) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index ad7d6dfd3f96a710015456453de57dc0eb58f94d..cb489af44d0adc7da377f73a3205c3c264769b4d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -24,6 +24,7 @@ import warnings import gast from paddle.fluid import framework +from paddle.fluid import in_dygraph_mode from paddle.fluid.dygraph import layers from paddle.fluid.data_feeder import check_type from paddle.fluid.layers.utils import flatten @@ -32,6 +33,7 @@ from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info @@ -283,13 +285,21 @@ class StaticLayer(object): Return: Outputs of decorated function. """ + # 1. call dygraph function directly if not enable `declarative` if not self._program_trans.enable_declarative: - warnings.warn( - "The decorator '@paddle.jit.to_static' doesn't work when setting ProgramTranslator.enable=False. " + logging_utils.warn( + "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. " "We will just return dygraph output.") return self._call_dygraph_function(*args, **kwargs) + if not in_dygraph_mode() and self._program_trans.enable_declarative: + raise RuntimeError( + "Failed to run the callable object {} decorated by '@paddle.jit.to_static', " + "because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the " + "following API: paddle.disable_static().".format( + self.dygraph_function)) + # 2. trace ops from dygraph layers and cache the generated program. args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) try: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index f344ad2f7d7af00e6037b7552e258bf5c796a3b8..86593dc24aa8bda7906aab2001e8bd285f64288a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -136,9 +136,12 @@ def is_api_in_module(node, module_prefix): # import_str = "".join(import_statements) import paddle import paddle.fluid as fluid + import paddle.fluid.dygraph as dygraph import paddle.fluid.layers as layers + from paddle.fluid.dygraph import to_variable - import paddle.fluid.dygraph as dygraph + from paddle import to_tensor + return eval("_is_api_in_module_helper({}, '{}')".format(func_str, module_prefix)) except NameError: @@ -146,15 +149,18 @@ def is_api_in_module(node, module_prefix): def is_dygraph_api(node): + # Note: A api in module dygraph_to_static is not a real dygraph api. 
if is_api_in_module(node, "paddle.fluid.dygraph.dygraph_to_static"): return False + # TODO(liym27): A better way to determine whether it is a dygraph api. + # Consider the decorator @dygraph_only return is_api_in_module(node, "paddle.fluid.dygraph") def is_paddle_api(node): - return is_api_in_module(node, "paddle.fluid") + return is_api_in_module(node, "paddle") # Is numpy_api cannot reuse is_api_in_module because of numpy module problem @@ -233,14 +239,6 @@ def _add_keywords_to(node, dygraph_api_name): return -def is_to_variable(node): - assert isinstance(node, gast.Call) - if is_dygraph_api(node): - api_name = ast_to_source_code(node.func).strip() - return api_name.endswith("to_variable") - return False - - def to_static_ast(node, class_node): assert isinstance(node, gast.Call) assert isinstance(class_node, gast.Call) @@ -268,29 +266,6 @@ def to_static_ast(node, class_node): return node -def to_assign_node(node): - # Transform dygraph api `fluid.dygraph.to_variable` to static api `fluid.layers.assign`. - # NOTE: - # 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16}, - # but api `assign` only supports {float32, float64, int32, int64, bool}; - # 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024. - assert isinstance(node, gast.Call) - assign_api = gast.parse('fluid.layers.assign').body[0].value - node.func = assign_api - - if node.args: - node.args = [node.args[0]] - node.keywords = [] - else: - for idx, kw in enumerate(node.keywords): - if kw.arg == 'value': - node.keywords[idx].arg = 'input' - node.keywords = [node.keywords[idx]] - node.args = [] - break - return node - - def update_args_of_func(node, dygraph_node, method_name): assert isinstance(node, gast.Call) if method_name not in ["__init__", "forward"]: diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 7f3d450a49c7d3fcc9ca1d3c2d7c5eb732671c6c..1d2ea142c7d5f2e653e446986a39d1bc155006f0 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -488,6 +488,15 @@ def _load_persistable_vars(model_path, return load_var_dict +# NOTE(chenweihang): to adapt paddle.load to get state_dict +def _remove_varname_suffix(var_dict, program_holder): + no_suffix_var_dict = dict() + for var_name in var_dict: + no_suffix_name = program_holder._suffix_varname_dict[var_name] + no_suffix_var_dict[no_suffix_name] = var_dict[var_name] + return no_suffix_var_dict + + def _construct_program_holders(model_path, model_filename=None): # make sure the path has been checked program_holder_dict = dict() @@ -517,7 +526,8 @@ def _construct_program_holders(model_path, model_filename=None): def _construct_params_and_buffers(model_path, programs, separate_params=False, - params_filename=None): + params_filename=None, + append_suffix=True): var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) if os.path.exists(var_info_path): var_dict = _load_persistable_vars(model_path, var_info_path, @@ -526,6 +536,10 @@ def _construct_params_and_buffers(model_path, else: var_dict = _load_persistable_vars_by_program( model_path, programs['forward'], params_filename) + + if not append_suffix: + var_dict = _remove_varname_suffix(var_dict, programs['forward']) + return var_dict @@ -542,89 +556,92 @@ class TranslatedLayer(layers.Layer): .. 
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) # 1. train & save model. - # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for data in train_loader(): - img, label = data - label.stop_gradient = True - cost = net(img) + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # train + train(layer, loader, loss_fn, adam) + # save model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + paddle.jit.save(layer, model_path) # 2. 
load model as TranslatedLayer - translated_layer = fluid.dygraph.jit.load(model_path) + + # load + translated_layer = paddle.jit.load(model_path) + # inference translated_layer.eval() - x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32')) + x = paddle.randn([1, IMAGE_SIZE], 'float32') pred = translated_layer(x) + # fine-tune translated_layer.train() - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=translated_layer.parameters()) - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - for data in train_loader(): - img, label = data - label.stop_gradient = True - - cost = translated_layer(img) + adam = opt.Adam(learning_rate=0.001, parameters=translated_layer.parameters()) + train(translated_layer, loader, loss_fn, adam) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - - avg_loss.backward() - adam.minimize(avg_loss) - translated_layer.clear_gradients() """ def __init__(self, programs, persistable_vars): @@ -685,7 +702,7 @@ class TranslatedLayer(layers.Layer): # 1. load program desc & construct _ProgramHolder programs = _construct_program_holders(model_path, model_filename) - # 2. load layer parameters & parameter attributes + # 2. load layer parameters & buffers persistable_vars = _construct_params_and_buffers( model_path, programs, separate_params, params_filename) @@ -800,3 +817,107 @@ class TranslatedLayer(layers.Layer): def eval(self): self._is_test = True + + def program(self, method_name='forward'): + """ + Gets translated program of specified method. + + Args: + - method_name (string): mehtod name corresponding to the program + to be obtained. Default: 'forward'. + + Returns: + Program + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 + + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + @paddle.jit.to_static + def forward(self, x): + return self._linear(x) + + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + + # enable dygraph mode + place = paddle.CPUPlace() + paddle.disable_static(place) + + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) + + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train + train(layer, loader, loss_fn, adam) + + # save + model_path = "linear.example.model" + paddle.jit.save(layer, model_path) + + # load + translated_layer = paddle.jit.load(model_path) + + # get program + program = translated_layer.program() + """ + # 1. get program holder + program_holder = self._program_holder_dict.get(method_name, None) + if program_holder is None: + raise ValueError( + "The method `%s` is not exists in loaded TranslatedLayer." % + method_name) + + # 2. get inference program desc + program_desc = program_holder.infer_program + + # 3. construct program + program = _build_program_by_desc(program_desc) + return program diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index f67b79b91f7da235697d920cf0dfe376e88ab93e..9f4ec2b55bc6b56fc796d3124edf1ec0deb3f23e 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -293,6 +293,8 @@ class SaveLoadConfig(object): self._model_filename = None self._params_filename = None self._separate_params = False + # used for `paddle.load` + self._keep_name_table = False # NOTE: Users rarely use following configs, so these configs are not open to users, # reducing user learning costs, but we retain the configuration capabilities @@ -600,6 +602,54 @@ class SaveLoadConfig(object): % type(value)) self._separate_params = value + @property + def keep_name_table(self): + """ + Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict. + This dict is the debugging information saved when call `paddle.save`. + It is generally only used for debugging and does not affect the actual training or inference. + By default, it will not be retained in `paddle.load` result. Default: False. + + .. note:: + Only used for ``paddle.load``. + + Examples: + .. 
code-block:: python + + import paddle + + paddle.disable_static() + + linear = paddle.nn.Linear(5, 1) + + state_dict = linear.state_dict() + paddle.save(state_dict, "paddle_dy") + + configs = paddle.SaveLoadConfig() + configs.keep_name_table = True + para_state_dict, _ = paddle.load("paddle_dy", configs) + + print(para_state_dict) + # the name_table is 'StructuredToParameterName@@' + # {'bias': array([0.], dtype=float32), + # 'StructuredToParameterName@@': + # {'bias': u'linear_0.b_0', 'weight': u'linear_0.w_0'}, + # 'weight': array([[ 0.04230034], + # [-0.1222527 ], + # [ 0.7392676 ], + # [-0.8136974 ], + # [ 0.01211023]], dtype=float32)} + """ + return self._keep_name_table + + @keep_name_table.setter + def keep_name_table(self, value): + if not isinstance(value, bool): + raise TypeError( + "The SaveLoadConfig.keep_name_table should be bool value, but received input's type is %s." + % type(value)) + self._keep_name_table = value + @switch_to_static_graph def save(layer, model_path, input_spec=None, configs=None): diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 1ef719b9da187be659d9c898ec996b5ad0c0d8a6..7075024369f328b59ecac014b0960fc26f447ff2 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -29,6 +29,9 @@ from .layer_object_helper import LayerObjectHelper from .base import program_desc_tracing_guard, param_guard from paddle.fluid import framework from ..param_attr import ParamAttr +from paddle.fluid.executor import Executor, global_scope +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import _current_expected_place as _get_device __all__ = ['Layer'] @@ -797,7 +800,7 @@ class Layer(core.Layer): raise ValueError( "super(YourLayer, self).__init__() should be called first") if len(self._loaddict_holder) > 0: - assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in stat_dict".format( + assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format( value.name) value.set_value(self._loaddict_holder[value.name]) @@ -943,12 +946,13 @@ class Layer(core.Layer): destination = destination_temp return destination - def set_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): + @framework.deprecate_stat_dict + def set_state_dict(self, + state_dict, + include_sublayers=True, + use_structured_name=True): ''' - Set parameters and persistable buffers from stat_dict. All the parameters and buffers will be reset by the tensor in the stat_dict + Set parameters and persistable buffers from state_dict. All the parameters and buffers will be reset by the tensor in the state_dict Parameters: state_dict(dict) : Dict contains all the parameters and persistable buffers. @@ -961,72 +965,67 @@ class Layer(core.Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle + + paddle.disable_static() + + emb = paddle.nn.Embedding([10, 10]) - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.set_dict( para_state_dict ) + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") + + para_state_dict, _ = paddle.load("paddle_dy") - ''' - self.load_dict( - stat_dict, - include_sublayers=include_sublayers, - use_structured_name=use_structured_name) + emb.set_state_dict(para_state_dict) - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): ''' - Set parameters and persistable buffers from stat_dict. All the parameters and persistabl buffers will be reset by the tensor in the stat_dict - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters and persistable buffers. - include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True - use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key. - Default: True - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.load_dict( para_state_dict ) - - ''' - - inner_state_dict = self.state_dict() + def _check_match(key, param): + state = state_dict.get(key, None) + if state is None: + raise ValueError("{} is not found in the provided dict.".format( + key)) + if list(state.shape) != list(param.shape): + raise ValueError( + "{} receives a shape {}, but the expected shape is {}.". + format(key, list(state.shape), list(param.shape))) + return param, state + + matched_param_state = [] + for key, param in self.state_dict().items(): + key_name = key if use_structured_name else param.name + try: + match_res = _check_match(key_name, param) + matched_param_state.append(match_res) + except ValueError as err: + warnings.warn(("Skip loading for {}. ".format(key) + str(err))) + + if in_dygraph_mode(): + for param, state in matched_param_state: + param.set_value(state) + else: - for name, param_or_buffer in inner_state_dict.items(): - key_name = name if use_structured_name else param_or_buffer.name - if key_name in stat_dict: - param_or_buffer.set_value(stat_dict[key_name]) - else: - raise RuntimeError( - "Parameter or persistable buffer not found, Can't find [ {} ] in stat_dict" - "use_structured_name is set to [{}]".format( - key_name, use_structured_name)) - unused_para_list = [] - for k, v in stat_dict.items(): - if k not in inner_state_dict: - unused_para_list.append(k) - if len(unused_para_list) > 0: - warnings.warn( - "Variables [ {} ] are not used, because not included in layers state_dict". 
- format(" ".join(unused_para_list))) + def _set_var(var, ndarray): + t = global_scope().find_var(var.name).get_tensor() + p = t._place() + if p.is_cpu_place(): + place = core.CPUPlace() + elif p.is_cuda_pinned_place(): + place = core.CUDAPinnedPlace() + else: + p = core.Place() + p.set_place(t._place()) + place = core.CUDAPlace(p.gpu_device_id()) + t.set(ndarray, place) + + executor = Executor(_get_device())._default_executor + # restore parameter states + core._create_loaded_parameter( + [param for param, state in matched_param_state], + global_scope(), executor) + for param, state in matched_param_state: + _set_var(param, state) + + # [aliases] Compatible with old method names + set_dict = set_state_dict + load_dict = set_state_dict diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index cce383be7e22cd066199f814db80a75367862b82..cd6af6fd5b575e8188088bde9e8944ab94c7e0f8 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -97,7 +97,7 @@ class LearningRateDecay(object): """ self.keys = ['step_num'] - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): """ Loads the schedulers state. """ @@ -114,6 +114,9 @@ class LearningRateDecay(object): "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" ) + # [aliases] Compatible with old method names + set_dict = set_state_dict + def step(self): raise NotImplementedError() diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index bb55c6725e6a62f2cef393fd34b249c217be0c54..8c4109674200bf97354444f92f00b13e053152a0 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -41,7 +41,7 @@ def monkey_patch_math_varbase(): The difference is, in dygraph mode, use auto-generated op functions for better performance. """ - @no_grad() + @no_grad def create_tensor(value, dtype, shape): out = _varbase_creator(dtype=dtype) out = core.ops.fill_constant(out, 'dtype', dtype, 'shape', shape, diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 6fbf3bfe76f6ef1f699ef34bb7efe60247c8531c..472022bced7e3e2dd11d301501ebaec75e5e412a 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -445,7 +445,7 @@ class DataParallel(layers.Layer): self._reshape_inplace(x=g_var, shape=g_shape) assert g_var.shape == g_shape - @no_grad() + @no_grad def apply_collective_grads(self): """ AllReduce the Parameters' gradient. @@ -587,12 +587,13 @@ class DataParallel(layers.Layer): include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix) - def set_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): + @framework.deprecate_stat_dict + def set_state_dict(self, + state_dict, + include_sublayers=True, + use_structured_name=True): ''' - Set parameters of self._layers from stat_dict. All the parameters of self._layers will be reset by the tensor in the stat_dict + Set parameters of self._layers from state_dict. All the parameters of self._layers will be reset by the tensor in the state_dict Parameters: state_dict(dict) : Dict contains all the parameters @@ -605,62 +606,27 @@ class DataParallel(layers.Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - strategy=fluid.dygraph.prepare_context() - emb = fluid.dygraph.Embedding([10, 10]) - emb = fluid.dygraph.DataParallel(emb, strategy) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.set_dict( para_state_dict ) + import paddle - ''' - - self._layers.set_dict( - stat_dict, - include_sublayers=include_sublayers, - use_structured_name=use_structured_name) - - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): - ''' - Set parameters of self._layers from stat_dict. All the parameters of self._layers will be reset by the tensor in the stat_dict - - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters - include_sublayers(bool, optional) : If true, also include the parameters from sublayers. Default: True - use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter name as key. - Default: True - Returns: - None + paddle.disable_static() - Examples: - .. code-block:: python + emb = paddle.nn.Embedding([10, 10]) + emb = fluid.dygraph.DataParallel(emb, strategy) - import paddle.fluid as fluid - with fluid.dygraph.guard(): - strategy=fluid.dygraph.prepare_context() - emb = fluid.dygraph.Embedding([10, 10]) - emb = fluid.dygraph.DataParallel(emb, strategy) + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") + para_state_dict, _ = paddle.load("paddle_dy") - emb.load_dict( para_state_dict ) + emb.set_state_dict(para_state_dict) ''' - self._layers.load_dict( - stat_dict, + self._layers.set_state_dict( + state_dict, include_sublayers=include_sublayers, use_structured_name=use_structured_name) + + # [aliases] Compatible with old method names + set_dict = set_state_dict + load_dict = set_state_dict diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fc4e91aad4fff1db325e17828d26ccd94c164c3d..5281df9ead10acea5ae8656dcc4a0eed14fb3e83 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -36,6 +36,7 @@ from . import core from . 
import unique_name import paddle.version as fluid_version import warnings +import functools __all__ = [ 'Program', @@ -238,6 +239,25 @@ def _fake_interface_only_(func): return __impl__ +# NOTE(chenweihang): There is argument name typo (stat_dict, correct name is state_dict) +# in fluid api Layer.set_dict, Optimizer.load, in order to correct the argument without +# introducing compatibility issues, add this decorator +# NOTE(chenweihang): not using `wrap_decorator` here is because `wrap_decorator` will +# move kwargs to args, which doesn't work in this decorate case +def deprecate_stat_dict(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if 'stat_dict' in kwargs: + warnings.warn( + "The argument `stat_dict` has deprecated, please change it to `state_dict`.", + DeprecationWarning) + kwargs['state_dict'] = kwargs['stat_dict'] + kwargs.pop('stat_dict') + return func(*args, **kwargs) + + return wrapper + + dygraph_not_support = wrap_decorator(_dygraph_not_support_) dygraph_only = wrap_decorator(_dygraph_only_) fake_interface_only = wrap_decorator(_fake_interface_only_) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index 378c8fc23d7528766ca9eca062c87a4511e32b46..216478479a7cfdcffac5f21855d0974309842c89 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -42,6 +42,9 @@ op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize +SPARSE_OP_LIST = ["lookup_table", "lookup_table_v2"] +SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"} + def _get_lr_ops(program): lr_ops = [] @@ -66,7 +69,7 @@ def _has_global_step(lr_ops): def is_sparse_op(op): - if op.type == "lookup_table" and op.attr('is_sparse') is True and op.attr( + if op.type in SPARSE_OP_LIST and op.attr('is_sparse') is True and op.attr( 'is_distributed') is False: return True @@ -78,7 +81,7 @@ def is_sparse_op(op): def is_distributed_sparse_op(op): - if op.type == "lookup_table" and op.attr('is_distributed') is True: + if op.type in SPARSE_OP_LIST and op.attr('is_distributed') is True: return True if op.type == "distributed_lookup_table" and op.attr( @@ -802,11 +805,10 @@ class CompileTimeStrategy(object): def _get_sparse_varnames(): varnames = [] - op_types = {"lookup_table": "W"} for op in origin_program.global_block().ops: - if op.type in op_types.keys() \ + if op.type in SPARSE_OP_TYPE_DICT.keys() \ and op.attr('remote_prefetch') is True: - param_name = op.input(op_types[op.type])[0] + param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0] varnames.append(param_name) return list(set(varnames)) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 201b3863a4b6d6d5fed036d85b2103f5defe61f0..4543af9820e8c9326098fa254494ca1c896d3b12 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -1,3 +1,4 @@ +# -*- coding: UTF-8 -*- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
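(A minimal sketch, not part of the patch, of what the new `deprecate_stat_dict` decorator above enables: the old `stat_dict` keyword keeps working on `set_state_dict` but now emits a DeprecationWarning. The `emb` layer follows the docstring examples elsewhere in this change.)

.. code-block:: python

    import paddle

    paddle.disable_static()
    emb = paddle.nn.Embedding([10, 10])
    state = emb.state_dict()

    emb.set_state_dict(state_dict=state)   # preferred spelling
    emb.set_state_dict(stat_dict=state)    # still works, warns via deprecate_stat_dict
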
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,6 +41,8 @@ LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() +SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"} + DEVICE_LIST = ["cpu", "gpu", "xpu"] COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"] DEFAULT_DEVICE = 'cpu' @@ -81,11 +84,10 @@ def distributed_ops_pass(program, config): def _get_pull_sparse_ops(_program): pull_sparse_ops = {} - op_types = {"lookup_table": "W"} for op in _program.global_block().ops: - if op.type in op_types.keys() \ + if op.type in SPARSE_OP_TYPE_DICT.keys() \ and op.attr('remote_prefetch') is True: - param_name = op.input(op_types[op.type])[0] + param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0] ops = pull_sparse_ops.get(param_name, []) ops.append(op) pull_sparse_ops[param_name] = ops @@ -101,6 +103,7 @@ def distributed_ops_pass(program, config): w = program.global_block().vars[ops[0].input("W")[0]] padding_idx = ops[0].attr("padding_idx") is_distributed = ops[0].attr("is_distributed") + op_type = ops[0].type outputs = [ program.global_block().vars[op.output("Out")[0]] for op in ops @@ -149,7 +152,8 @@ def distributed_ops_pass(program, config): "is_distributed": is_distributed, "pserver_num": len(pserver_endpoints), "padding_idx": padding_idx, - "trainer_id": trainer_id + "trainer_id": trainer_id, + "lookup_table_version": op_type }) else: raise ValueError( @@ -438,7 +442,23 @@ def find_heter_ops(program, default_device="cpu"): def create_heter_program(program, config, heter_program, heter_ops, block_var_detail, current_device): - # add heter op + + # This function mainly includes the following contents: + # 1. For every heter block: + # a) copy heter device op from origin program + # b) create variables which belong to heter op: + # -> if variable is persistable, clone it in global_scope + # -> if variable is temp, create it in heter block + # c) create communicate related op as follow: + # joint_var.0_1 -> slice -> reshape -> origin_var + # origin_var -> origin_program + # reshape -> concat -> joint_var.1_2 + # d) copy send op from origin program for var@grad which loacted in current heter block + # e) re-check every op in current blcok if its device is not current heter devie + # 2. Create send op for step counter in last heter-block + # 3. Create Listen&Serv OP for distributed training + # 4. 
update CompileTimeStrategy for heter_program + optimizer_block = [] grad_to_block_id = [] send_grad_var_list = [] @@ -450,17 +470,10 @@ def create_heter_program(program, config, heter_program, heter_ops, for _, op in enumerate(heter_block_ops): block_append_op(heter_program, program, heter_block, op) - # add relate variables - inputs = _get_input_map_from_op(program.global_block().vars, op) - add_vars_by_op_map(inputs, heter_program) - - outputs = _get_output_map_from_op(program.global_block().vars, op) - add_vars_by_op_map(outputs, heter_program) - entrance_vars = block_var_detail[index]["entrance"] - add_vars_by_var_list(entrance_vars, program, heter_program) + add_vars_by_var_list(entrance_vars, program, heter_program, heter_block) exit_vars = block_var_detail[index]["exit"] - add_vars_by_var_list(exit_vars, program, heter_program) + add_vars_by_var_list(exit_vars, program, heter_program, heter_block) comm_info = get_communicate_var_info(program, index, entrance_vars, exit_vars) @@ -468,13 +481,13 @@ def create_heter_program(program, config, heter_program, heter_ops, grad_to_block_id.append(comm_info["block_input_var_name"] + ":" + str( heter_block.idx)) - # create slice op first_op_index = 0 get_type_var_name = comm_info["input_var_reshape_name"][0].split( ".input_reshape@Heter")[0] - get_type_var = heter_program.global_block().vars[get_type_var_name] + get_type_var = heter_block.vars[get_type_var_name] + # create slice op insert_recv_slice_op( heter_program, heter_block, first_op_index, comm_info["block_input_var_name"], @@ -484,6 +497,13 @@ def create_heter_program(program, config, heter_program, heter_ops, for i in range(len(comm_info["input_var_reshape_dim"])) ]) first_op_index += len(comm_info["input_var_reshape_dim"]) + + heter_program.global_block().create_var( + name=comm_info["block_input_var_name"], + shape=(-1, sum(comm_info["input_var_reshape_dim"])), + dtype=get_type_var.dtype, + type=get_type_var.type) + # create reshape op for i in range(len(comm_info["input_var_reshape_name"])): var_name = entrance_vars[i] @@ -511,13 +531,14 @@ def create_heter_program(program, config, heter_program, heter_ops, comm_info["block_output_var_name"], [-1, sum(comm_info["output_var_reshape_dim"])]) check_op_device(heter_block, current_device) + + # add send op send_grad_var_list = send_grad_var_list + add_heter_send_op( program, heter_program, heter_block, block_var_detail[index]) # add step conter send_input_vars = [] dummy_output = [] - trainer_id = config.get_role_id() pserver_endpoints = config.get_ps_endpoints() optimizer_block[-1].append_op( type="send", @@ -552,7 +573,6 @@ def create_heter_program(program, config, heter_program, heter_ops, # append the listen_and_serv op heter_program.global_block().append_op( type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs) - check_heter_compile_time_strategy(program, config, send_grad_var_list) @@ -571,6 +591,16 @@ def check_heter_compile_time_strategy(program, config, send_grad_var_list): def create_trainer_program(program, config, heter_ops, block_var_detail): + # This function mainly includes the following contents: + # 1. For every heter block in origin program + # a) delete heter op and related variables + # b) add send&recv op + # c) add communicate ops as follows: + # origin_var -> reshape -> concat -> joint_var.0_1 + # send&recv op(send joint_var.0_1; recv joint_var.1_2) + # joint_var.1_2 -> slice -> reshape -> origin_var + # d) remove send op which related var@grad is not in trainer program + # 2. 
check every op's device for device in heter_ops.keys(): for heter_block_index in sorted(heter_ops[device]): replace_ops_by_communicate_op(program, config, heter_block_index, @@ -929,19 +959,19 @@ def insert_reshape_op(program, var_name, new_var_name, new_var_shape=None): - input_var = program.global_block().vars[var_name] + input_var = block.vars[var_name] - if new_var_name not in program.global_block().vars: - out = program.global_block().create_var( + if new_var_name not in block.vars: + out = block.create_var( name=new_var_name, shape=new_var_shape, dtype=input_var.dtype, type=input_var.type) else: - out = program.global_block().vars[new_var_name] + out = block.vars[new_var_name] new_var_shape = out.shape - x_shape = program.global_block().create_var( + x_shape = block.create_var( name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype) block._insert_op( index=index, @@ -954,9 +984,7 @@ def insert_reshape_op(program, def insert_send_concat_op(program, block, index, var_name_list, new_var_name, new_var_shape): - input_var_list = [ - program.global_block().vars[var_name] for var_name in var_name_list - ] + input_var_list = [block.vars[var_name] for var_name in var_name_list] out = program.global_block().create_var( name=new_var_name, @@ -984,14 +1012,14 @@ def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype, out_list = [] for i in range(len(new_var_name_list)): - if new_var_name_list[i] not in program.global_block().vars: - out = program.global_block().create_var( + if new_var_name_list[i] not in block.vars: + out = block.create_var( name=new_var_name_list[i], shape=new_var_shape_list[i], dtype=input_var.dtype, type=input_var.type) else: - out = program.global_block().vars[new_var_name_list[i]] + out = block.vars[new_var_name_list[i]] out_list.append(out) start_index = 0 @@ -1034,21 +1062,33 @@ def deleter_trainer_useless_var(program): def block_append_op(program, origin_program, block, op): - inputs = _get_input_map_from_op(origin_program.global_block().vars, op) + merge_ordereddict = origin_program.global_block().vars.copy() + merge_ordereddict.update(block.vars) + inputs = _get_input_map_from_op(merge_ordereddict, op) for key, varlist in six.iteritems(inputs): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: - if var.name not in program.global_block().vars: - program.global_block()._clone_variable(var) + if var.name not in program.global_block( + ).vars and var.name not in block.vars: + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) outputs = _get_output_map_from_op(origin_program.global_block().vars, op) for key, varlist in six.iteritems(outputs): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: - if var.name not in program.global_block().vars: - program.global_block()._clone_variable(var) + if var.name not in program.global_block( + ).vars and var.name not in block.vars: + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) if "_grad" not in op.type: # for forward op @@ -1073,21 +1113,15 @@ def block_append_op(program, origin_program, block, op): block._sync_with_cpp() -def add_vars_by_op_map(var_map, program): - for key, varlist in six.iteritems(var_map): - if not isinstance(varlist, list): - varlist = [varlist] - for i in range(len(varlist)): - var = varlist[i] - if var.name not in 
program.global_block().vars: - program.global_block()._clone_variable(var) - - -def add_vars_by_var_list(var_name_list, origin_program, program): +def add_vars_by_var_list(var_name_list, origin_program, program, block): for var_name in var_name_list: if var_name not in program.global_block().vars: var = origin_program.global_block().vars[var_name] - program.global_block()._clone_variable(var) + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) def get_varlist_from_op_map(var_map): diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 15a3022f932f4a702bf7f94ed936468b6a06e94e..529588c0846b5a90a842c398bbb4409a04f35d53 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -129,6 +129,7 @@ def one_hot(input, depth, allow_out_of_range=False): return one_hot_out +@deprecated(since='2.0.0', update_to='paddle.nn.functional.embedding') def embedding(input, size, is_sparse=False, diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 19822e410c71aa993e2d90a92c57c3522023ad81..db556913384785e1f11ba05dcc524ef1f1de92ab 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -147,8 +147,10 @@ class LayerHelper(LayerHelperBase): if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): act['use_cudnn'] = self.kwargs.get('use_cudnn') - if 'use_mkldnn' in self.kwargs: - act['use_mkldnn'] = self.kwargs.get('use_mkldnn') + use_mkldnn = self.kwargs.get( + 'use_mkldnn', core.globals().get("FLAGS_use_mkldnn", False)) + if use_mkldnn: + act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 19c46fd21b1cda5f6f3155250fb953ce9a962bb2..9313de8c64fcf4efc1e192ad2826f05f51869bbf 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -367,6 +367,7 @@ def fc(input, return helper.append_activation(pre_activation) +@deprecated(since="2.0.0", update_to="paddle.nn.functional.embedding") def embedding(input, size, is_sparse=False, @@ -4814,11 +4815,6 @@ def split(input, num_or_sections, dim=-1, name=None): Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``input`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``num_or_sections`` is not int, list or tuple. - TypeError: ``dim`` is not int or Tensor. The data type of ``dim`` must be int32 or int64 when it's a Tensor. - Example: .. code-block:: python @@ -6102,11 +6098,6 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): Returns: Tensor: A reshaped Tensor with the same data type as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable. - Raises: - TypeError: If actual_shape is neither Tensor nor None. - ValueError: If more than one elements of ``shape`` is -1. - ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. - ValueError: If the elements in ``shape`` is negative except -1. Examples: .. 
code-block:: python @@ -8255,10 +8246,6 @@ def gather(input, index, overwrite=True): Returns: output (Tensor): The output is a tensor with the same rank as input. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - Examples: .. code-block:: python @@ -8348,10 +8335,6 @@ def gather_nd(input, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - - Raises: - TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. Examples: @@ -10017,15 +10000,16 @@ def stack(x, axis=0, name=None): Args: - x (Variable|list(Variable)): Input :code:`x` can be a single Tensor, a :code:`list` of Tensors. - If :code:`x` is a :code:`list`, the shapes of all these Tensors + x (list(Variable)|tuple(Variable)): Input :code:`x` can be a :code:`list` or :code:`tuple` of Tensors, the shapes of all these Tensors must be the same. Supposing input is N dims Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. Supported data types: float32, float64, int32, int64. - axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`. - R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`. - The default value of axis is 0. + axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, + where ``R`` is the number of dimensions of the first input tensor ``x[0]``. + If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. + name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. + Returns: Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`. @@ -10043,18 +10027,27 @@ def stack(x, axis=0, name=None): data = layers.stack([x1,x2], axis=1) # stack according to axis 1, data.shape=[None, 2, 1, 2] - # stack single Tensor - data = layers.stack(x1) # stack according to axis 0, data.shape=[1, None, 1, 2] """ axis = 0 if axis is None else axis - if not isinstance(x, list) and not isinstance(x, tuple): - x = [x] if in_dygraph_mode(): return core.ops.stack(x, 'axis', axis) + if not isinstance(x, list) and not isinstance(x, tuple): + # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. + # In that case, Variable is array of tensors indeed. 
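(A brief, illustrative sketch of the behavioural change to `stack` above: a bare Tensor input is no longer wrapped into a list, except for LoDTensorArray variables created by `create_array` / `array_write`.)

.. code-block:: python

    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    x1 = fluid.data(name='x1', shape=[None, 1, 2], dtype='float32')
    x2 = fluid.data(name='x2', shape=[None, 1, 2], dtype='float32')

    out = layers.stack([x1, x2], axis=1)   # list/tuple of Tensors: supported
    # layers.stack(x1)                     # single Tensor: now raises TypeError
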
+ if isinstance(x, Variable) and x.desc.type( + ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + x = [x] + else: + raise TypeError("The type of '%s' in %s must be %s, but received %s" + % ('x', 'stack', + 'list[Tensor], tuple[Tensor] or TensorArray', + type(x))) + helper = LayerHelper('stack', **locals()) + out = helper.create_variable_for_type_inference(x[0].dtype) if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ @@ -10599,7 +10592,7 @@ def gaussian_random(shape, dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', float(std), 'seed', seed, 'dtype', @@ -10616,7 +10609,7 @@ def gaussian_random(shape, 'dtype': dtype, 'use_mkldnn': False } - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, @@ -12030,6 +12023,8 @@ for func in [ elementwise_floordiv, ]: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + + # insert the c++ doc string on top of python doc string func.__doc__ = _generate_doc_string_( op_proto, additional_args_lines=[ @@ -12047,6 +12042,16 @@ for func in [ "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out" }) + """\n""" + str(func.__doc__) + doc_list = func.__doc__.splitlines() + + for idx, val in enumerate(doc_list): + if val.startswith("Warning: ") and val.endswith( + " instead." + ) and "and will be removed in future versions." in val: + doc_list.insert(0, doc_list.pop(idx)) + func.__doc__ = "\n" + "\n".join(i for i in doc_list) + break + for func in []: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) func.__doc__ = _generate_doc_string_( @@ -12152,13 +12157,10 @@ def logical_and(x, y, out=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([True], dtype=np.bool) - y_data = np.array([True, False, True, False], dtype=np.bool) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([True]) + y = paddle.to_tensor([True, False, True, False]) res = paddle.logical_and(x, y) print(res.numpy()) # [True False True False] """ @@ -12271,11 +12273,9 @@ def logical_not(x, out=None, name=None): Examples: .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([True, False, True, False], dtype=np.bool) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([True, False, True, False]) res = paddle.logical_not(x) print(res.numpy()) # [False True False True] """ @@ -15093,7 +15093,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.uniform_random('shape', shape, 'min', float(min), 'max', float(max), 'seed', seed, 'dtype', dtype) @@ -15103,7 +15103,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, inputs = dict() attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand') helper = LayerHelper("uniform_random", **locals()) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 84cacea6ba5723f8a06fc87fa9c59d96f802e65a..1efae3ddf1f3422a53f69c4b5b8eeec6183fae96 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -86,13 +86,11 @@ add_sample_code(globals()["sigmoid"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.sigmoid(x) print(out.numpy()) # [0.40131234 0.450166 0.52497919 0.57444252] @@ -103,13 +101,11 @@ add_sample_code(globals()["logsigmoid"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.logsigmoid(x) print(out.numpy()) # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] @@ -120,12 +116,10 @@ add_sample_code(globals()["exp"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.exp(x) print(out.numpy()) # [0.67032005 0.81873075 1.10517092 1.34985881] @@ -136,12 +130,10 @@ add_sample_code(globals()["tanh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.tanh(x) print(out.numpy()) # [-0.37994896 -0.19737532 0.09966799 0.29131261] @@ -152,12 +144,10 @@ add_sample_code(globals()["atan"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.atan(x) print(out.numpy()) # [-0.38050638 -0.19739556 0.09966865 0.29145679] @@ -170,11 +160,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """) @@ -183,12 +172,10 @@ add_sample_code(globals()["sqrt"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([0.1, 0.2, 0.3, 0.4]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) out = paddle.sqrt(x) print(out.numpy()) # [0.31622777 0.4472136 0.54772256 0.63245553] @@ -199,12 +186,10 @@ add_sample_code(globals()["rsqrt"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([0.1, 0.2, 0.3, 0.4]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) out = paddle.rsqrt(x) print(out.numpy()) # [3.16227766 2.23606798 1.82574186 1.58113883] @@ -215,12 +200,10 @@ add_sample_code(globals()["abs"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.abs(x) print(out.numpy()) # [0.4 0.2 0.1 0.3] @@ -231,12 +214,10 @@ add_sample_code(globals()["ceil"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.ceil(x) print(out.numpy()) # [-0. -0. 1. 1.] @@ -247,12 +228,10 @@ add_sample_code(globals()["floor"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.floor(x) print(out.numpy()) # [-1. -1. 0. 0.] @@ -263,12 +242,10 @@ add_sample_code(globals()["cos"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.cos(x) print(out.numpy()) # [0.92106099 0.98006658 0.99500417 0.95533649] @@ -279,12 +256,10 @@ add_sample_code(globals()["acos"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.acos(x) print(out.numpy()) # [1.98231317 1.77215425 1.47062891 1.26610367] @@ -295,12 +270,10 @@ add_sample_code(globals()["sin"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.sin(x) print(out.numpy()) # [-0.38941834 -0.19866933 0.09983342 0.29552021] @@ -311,12 +284,10 @@ add_sample_code(globals()["asin"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.asin(x) print(out.numpy()) # [-0.41151685 -0.20135792 0.10016742 0.30469265] @@ -327,12 +298,10 @@ add_sample_code(globals()["cosh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.cosh(x) print(out.numpy()) # [1.08107237 1.02006676 1.00500417 1.04533851] @@ -343,12 +312,10 @@ add_sample_code(globals()["sinh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.sinh(x) print(out.numpy()) # [-0.41075233 -0.201336 0.10016675 0.30452029] @@ -359,12 +326,10 @@ add_sample_code(globals()["round"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.5, -0.2, 0.6, 1.5]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) out = paddle.round(x) print(out.numpy()) # [-1. -0. 1. 2.] @@ -375,12 +340,10 @@ add_sample_code(globals()["reciprocal"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.reciprocal(x) print(out.numpy()) # [-2.5 -5. 10. 3.33333333] @@ -391,12 +354,10 @@ add_sample_code(globals()["square"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.square(x) print(out.numpy()) # [0.16 0.04 0.01 0.09] @@ -409,11 +370,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] """) @@ -424,11 +384,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """) @@ -761,11 +720,9 @@ Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.erf(x) print(out.numpy()) # [-0.42839236 -0.22270259 0.11246292 0.32862676] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 77a78eb4a14a0a5ad9be9cff71131ca473106ab8..a90551c1b7b4fd45ae9a0e1cfa225a87db811295 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -29,6 +29,7 @@ from ..data_feeder import check_variable_and_dtype, check_type, check_dtype, con from paddle.utils import deprecated import numpy import warnings +from .utils import check_shape __all__ = [ 'create_tensor', 'create_parameter', 'create_global_var', 'cast', @@ -276,11 +277,6 @@ def concat(input, axis=0, name=None): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Raises: - TypeError: ``input`` must be one of list, tuple or Tensor. - TypeError: The data type of ``input`` must be one of bool, float16, float32, float64, int32 and int64. - TypeError: The ``axis`` must be int or Tensor. The dtype of ``axis`` must be int32 or int64 when it's a Tensor. - TypeError: All the Tensors in ``input`` must have the same data type. Returns: Tensor: A Tensor with the same data type as ``input``. @@ -657,12 +653,6 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): Returns: Tensor: Tensor which is created according to shape and dtype. - Raises: - TypeError: The dtype must be one of bool, float16, float32, float64, int32 and int64 - and the data type of ``out`` must be the same as the ``dtype``. - TypeError: The shape must be one of list, tuple and Tensor, the data type of ``shape`` - must be int32 or int64 when ``shape`` is a Tensor - Examples: .. code-block:: python @@ -694,7 +684,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): attrs['str_value'] = str(float(value)) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) if out is None: out = _varbase_creator(dtype=dtype) @@ -718,20 +708,18 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): value = cast(value, dtype) inputs['ValueTensor'] = value + check_shape(shape) check_dtype(dtype, 'dtype', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') - if isinstance(shape, Variable): - check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'fill_constant') - if out is not None: check_variable_and_dtype(out, 'out', [convert_dtype(dtype)], 'fill_constant') helper = LayerHelper("fill_constant", **locals()) - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='fill_constant') if out is None: @@ -1050,10 +1038,6 @@ def ones(shape, dtype, force_cpu=False): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. Examples: .. 
code-block:: python @@ -1086,10 +1070,6 @@ def zeros(shape, dtype, force_cpu=False, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. Examples: .. code-block:: python @@ -1453,14 +1433,6 @@ def linspace(start, stop, num, dtype=None, name=None): the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ the value with input :attr:`start`. - Raises: - TypeError: The ``dtype`` must be one of int32, int64, float32 and float64. - TypeError: The type of ``num`` must be int When it's not a Tensor. - TypeError: The data type of ``num`` must be int32 When it's a Tensor. - TypeError: The data type of ``start`` and ``stop`` must be same as ``dtype`` When it's a Tensor. - - - Examples: .. code-block:: python @@ -1474,6 +1446,8 @@ def linspace(start, stop, num, dtype=None, name=None): tensor_num = num tensor_start = start tensor_stop = stop + if not isinstance(num, Variable): + check_type(num, 'num', (int), 'linspace') if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if not isinstance(start, Variable): @@ -1488,21 +1462,32 @@ def linspace(start, stop, num, dtype=None, name=None): helper = LayerHelper("linspace", **locals()) + start_dtype = convert_dtype(tensor_start.dtype) + stop_dtype = convert_dtype(tensor_stop.dtype) + out_dtype = convert_dtype(dtype) if isinstance(start, Variable): - check_dtype(start.dtype, 'start', (convert_dtype(dtype)), 'linspace') + check_dtype(start.dtype, 'start', + ['float32', 'float64', 'int32', 'int64'], 'linspace') else: check_type(start, 'start', (int, float), 'linspace') if isinstance(stop, Variable): - check_dtype(stop.dtype, 'stop', (convert_dtype(dtype)), 'linspace') + check_dtype(stop.dtype, 'stop', + ['float32', 'float64', 'int32', 'int64'], 'linspace') else: check_type(stop, 'stop', (int, float), 'linspace') if isinstance(num, Variable): check_dtype(num.dtype, 'num', ['int32'], 'linspace') - else: - check_type(num, 'num', (int), 'linspace') check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], 'linspace') + if ((stop_dtype == "float64" or start_dtype == "float64") and + out_dtype in ["float32", "int32"]) or ((stop_dtype == "int64" or + start_dtype == "int64") and + out_dtype == "int32"): + raise ValueError( + "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, " + "which may cause data type overflows. Please reset attr(dtype) of linspace." + .format(start_dtype, stop_dtype, dtype)) out = helper.create_variable_for_type_inference(dtype=dtype) @@ -1629,9 +1614,6 @@ def eye(num_rows, Returns: Tensor: An identity Tensor or LoDTensor of shape batch_shape + [num_rows, num_columns]. - Raises: - TypeError: The `dtype` must be one of float16, float32, float64, int32 and int64. - TypeError: The `num_columns` must be non-negative int. Examples: .. 
code-block:: python diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 0d6965239e14b92d3d4997a9cf8efbe3fa7048b7..2095c9957e75b94396e573eba341f4cfded5dbc8 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -20,6 +20,7 @@ import numpy as np from ..framework import Variable from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..layer_helper import LayerHelper +from sys import version_info def convert_to_list(value, n, name, dtype=np.int): @@ -282,7 +283,7 @@ def _contain_var(list_or_tuple): return False -def _get_shape_tensor_inputs(inputs, attrs, shape, op_type): +def get_shape_tensor_inputs(inputs, attrs, shape, op_type): from .tensor import fill_constant, cast def _get_attr_shape(list_shape): @@ -347,7 +348,7 @@ def _convert_to_tensor_list(old_list, dtype="int32"): return new_list_tensor -def _convert_shape_to_list(shape): +def convert_shape_to_list(shape): """ Convert shape(list, tuple, variable) to list in imperative mode """ @@ -358,3 +359,22 @@ def _convert_shape_to_list(shape): else: shape = list(shape.numpy().astype(int)) return shape + + +def check_shape(shape): + """ + Check shape type and shape elements type before passing it to fill_constant + """ + if isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'fill_constant') + else: + for ele in shape: + if not isinstance(ele, Variable): + if ele < 0: + raise ValueError( + "All elements in ``shape`` must be positive when it's a list or tuple" + ) + if not isinstance(ele, six.integer_types): + raise TypeError( + "All elements in ``shape`` must be integers when it's a list or tuple" + ) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 8f34576b836a5412a6792a6dfd63b3c9fd8de560..8b37cfef3890eace0ff5141eeb91d85e78f1c964 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -61,7 +61,7 @@ class Optimizer(object): but need to use one of it's implementation. """ - @imperative_base.no_grad() + @imperative_base.no_grad def __init__(self, learning_rate, parameter_list=None, @@ -170,7 +170,7 @@ class Optimizer(object): return state_dict @framework.dygraph_only - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. @@ -182,20 +182,22 @@ class Optimizer(object): Examples: .. 
code-block:: python - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle + + paddle.disable_static() + + emb = paddle.nn.Embedding([10, 10]) - state_dict = emb.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - adam = fluid.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), + adam = paddle.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), parameter_list=emb.parameters()) - state_dict = adam.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + state_dict = adam.state_dict() - para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + para_state_dict, opti_state_dict = paddle.load("paddle_dy") - adam.set_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) ''' from paddle.optimizer.lr_scheduler import _LRScheduler @@ -257,6 +259,9 @@ class Optimizer(object): tensor.set(load_para_np, framework._current_expected_place()) + # [aliases] Compatible with old method names + set_dict = set_state_dict + def get_opti_var_name_list(self): return self._opti_name_list @@ -897,7 +902,7 @@ class Optimizer(object): if p.trainable: p.clear_gradient() - @imperative_base.no_grad() + @imperative_base.no_grad def minimize(self, loss, startup_program=None, @@ -1015,7 +1020,7 @@ class SGDOptimizer(Optimizer): name=name) self.type = "sgd" - @no_grad() + @no_grad def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): @@ -1552,7 +1557,7 @@ class DGCMomentumOptimizer(Optimizer): dgc_op._set_attr(op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name]) - @imperative_base.no_grad() + @imperative_base.no_grad def apply_gradients(self, params_grads): params_grads = sorted(params_grads, key=lambda x: x[0].name) params_grads, table_param_and_grad, table_optimize_op = \ @@ -4595,15 +4600,16 @@ class RecomputeOptimizer(Optimizer): ), "_checkpoints should be a list of Variable or a list of String" self._checkpoints = checkpoints - def load(self, stat_dict): + @framework.deprecate_stat_dict + def load(self, state_dict): """ - :api_attr: Static Graph + :api_attr: Static Graph load function is not supported by Recompute Optimizer for now. :return: None Args: - stat_dict: the dict load by load_persistable method + state_dict: the dict load by load_persistable method Examples: .. code-block:: python @@ -4627,8 +4633,8 @@ class RecomputeOptimizer(Optimizer): sgd = fluid.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([fc_1, pred]) try: - stat_dict = {} - sgd.load(stat_dict) + state_dict = {} + sgd.load(state_dict) except NotImplementedError as e: print(cpt.get_exception_message(e)) """ diff --git a/python/paddle/fluid/tests/demo/executor_train_dataset.py b/python/paddle/fluid/tests/demo/executor_train_dataset.py deleted file mode 100644 index 6938982de725c296aae29e70d0640749d0876353..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/executor_train_dataset.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
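# --- annotation (hedged sketch, not part of the patch): the class-level alias added above,
# "set_dict = set_state_dict", keeps old call sites working while the public name changes.
# The pattern in isolation (_OptimizerLike and _opt are illustrative names only):
class _OptimizerLike(object):
    def set_state_dict(self, state_dict):
        self._state = dict(state_dict)

    # old spelling kept as a backward-compatible alias
    set_dict = set_state_dict

_opt = _OptimizerLike()
_opt.set_dict({"beta1_pow_acc": 0.9})  # still dispatches to set_state_dict
assert _opt._state == {"beta1_pow_acc": 0.9}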
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tarfile -import paddle.fluid as fluid -import paddle -from paddle.fluid import core - -URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz' -MD5 = '2a405a31508969b3ab823f42c0f522ca' - - -def bow_net(data, - label, - dict_dim=89528, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2): - """ - BOW net - This model is from https://github.com/PaddlePaddle/models: - models/fluid/PaddleNLP/text_classification/nets.py - """ - # embedding - emb = fluid.layers.embedding( - input=data, size=[dict_dim, emb_dim], is_sparse=True) - bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bowh = fluid.layers.tanh(bow) - # fc layer after conv - fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh") - fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") - # probability of each class - prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") - # cross entropy loss - cost = fluid.layers.cross_entropy(input=prediction, label=label) - # mean loss - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, acc, prediction - - -def train(): - # Download data - with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf: - tarf.extractall(path='./') - tarf.close() - - # Initialize dataset description - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_batch_size(128) # See API doc for how to change other fields - - # define network - # input text data - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - # label data - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - dataset.set_use_var([data, label]) - avg_cost, acc, prediction = bow_net(data, label) - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) - opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) - - # Run startup program - startup_program = fluid.default_startup_program() - place = fluid.CPUPlace() - executor = fluid.Executor(place) - executor.run(startup_program) - - main_program = fluid.default_main_program() - epochs = 10 - filelist = ["train_data/part-%d" % i for i in range(12)] - dataset.set_filelist(filelist) - for i in range(epochs): - dataset.set_thread(4) - executor.train_from_dataset( - main_program, # This can be changed during iteration - dataset, # This can be changed during iteration - debug=False) - fluid.io.save_inference_model('imdb/epoch%d.model' % i, - [data.name, label.name], [acc], executor) - - -if __name__ == "__main__": - train() diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py deleted file mode 100644 index bd77779ce6ab5cf19e3e5ace3e51e39734b27c10..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/fc_gan.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import errno -import math -import os - -import matplotlib -import numpy - -import paddle -import paddle.fluid as fluid - -matplotlib.use('Agg') -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec - -NOISE_SIZE = 100 -NUM_PASS = 1000 -NUM_REAL_IMGS_IN_BATCH = 121 -NUM_TRAIN_TIMES_OF_DG = 3 -LEARNING_RATE = 2e-5 - - -def D(x): - hidden = fluid.layers.fc(input=x, - size=200, - act='relu', - param_attr='D.w1', - bias_attr='D.b1') - logits = fluid.layers.fc(input=hidden, - size=1, - act=None, - param_attr='D.w2', - bias_attr='D.b2') - return logits - - -def G(x): - hidden = fluid.layers.fc(input=x, - size=200, - act='relu', - param_attr='G.w1', - bias_attr='G.b1') - img = fluid.layers.fc(input=hidden, - size=28 * 28, - act='tanh', - param_attr='G.w2', - bias_attr='G.b2') - return img - - -def plot(gen_data): - gen_data.resize(gen_data.shape[0], 28, 28) - n = int(math.ceil(math.sqrt(gen_data.shape[0]))) - fig = plt.figure(figsize=(n, n)) - gs = gridspec.GridSpec(n, n) - gs.update(wspace=0.05, hspace=0.05) - - for i, sample in enumerate(gen_data): - ax = plt.subplot(gs[i]) - plt.axis('off') - ax.set_xticklabels([]) - ax.set_yticklabels([]) - ax.set_aspect('equal') - plt.imshow(sample.reshape(28, 28), cmap='Greys_r') - - return fig - - -def main(): - try: - os.makedirs("./out") - except OSError as e: - if e.errno != errno.EEXIST: - raise - - startup_program = fluid.Program() - d_program = fluid.Program() - dg_program = fluid.Program() - - with fluid.program_guard(d_program, startup_program): - img = fluid.layers.data(name='img', shape=[784], dtype='float32') - d_loss = fluid.layers.sigmoid_cross_entropy_with_logits( - x=D(img), - label=fluid.layers.data( - name='label', shape=[1], dtype='float32')) - d_loss = fluid.layers.mean(d_loss) - - with fluid.program_guard(dg_program, startup_program): - noise = fluid.layers.data( - name='noise', shape=[NOISE_SIZE], dtype='float32') - g_img = G(x=noise) - g_program = dg_program.clone() - dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits( - x=D(g_img), - label=fluid.layers.fill_constant_batch_size_like( - input=noise, dtype='float32', shape=[-1, 1], value=1.0)) - dg_loss = fluid.layers.mean(dg_loss) - - opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE) - - opt.minimize(loss=d_loss, startup_program=startup_program) - opt.minimize( - loss=dg_loss, - startup_program=startup_program, - parameter_list=[ - p.name for p in g_program.global_block().all_parameters() - ]) - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(startup_program) - - num_true = NUM_REAL_IMGS_IN_BATCH - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=60000), - batch_size=num_true) - - for pass_id in range(NUM_PASS): - for batch_id, data in enumerate(train_reader()): - num_true = len(data) - n = numpy.random.uniform( - low=-1.0, high=1.0, - size=[num_true * NOISE_SIZE]).astype('float32').reshape( - [num_true, NOISE_SIZE]) - generated_img = exe.run(g_program, - feed={'noise': n}, - fetch_list={g_img})[0] - real_data = numpy.array([x[0] for x in 
data]).astype('float32') - real_data = real_data.reshape(num_true, 784) - total_data = numpy.concatenate([real_data, generated_img]) - total_label = numpy.concatenate([ - numpy.ones( - shape=[real_data.shape[0], 1], dtype='float32'), - numpy.zeros( - shape=[real_data.shape[0], 1], dtype='float32') - ]) - d_loss_np = exe.run(d_program, - feed={'img': total_data, - 'label': total_label}, - fetch_list={d_loss})[0] - for _ in range(NUM_TRAIN_TIMES_OF_DG): - n = numpy.random.uniform( - low=-1.0, high=1.0, - size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape( - [2 * num_true, NOISE_SIZE, 1, 1]) - dg_loss_np = exe.run(dg_program, - feed={'noise': n}, - fetch_list={dg_loss})[0] - print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format( - pass_id, batch_id, d_loss_np, dg_loss_np)) - # generate image each batch - fig = plot(generated_img) - plt.savefig( - 'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight') - plt.close(fig) - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/demo/pipeline_train.py b/python/paddle/fluid/tests/demo/pipeline_train.py deleted file mode 100644 index 2f75908a160fd3c61c743dc407095d645737a534..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/pipeline_train.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -import numpy as np -import copy -import pickle -import os -from functools import partial -import logging -import time -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import argparse -import random -import sys -import math - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger("fluid") -logger.setLevel(logging.INFO) - -is_profile = False - - -def parse_args(): - parser = argparse.ArgumentParser("Resnet with pipelie parallel.") - parser.add_argument( - '--batch_size', type=int, default=100, help='input batch size') - parser.add_argument('--lr', type=float, default=0.001, help='learning rate') - return parser.parse_args() - - -def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, - act=None): - conv = fluid.layers.conv2d( - input=input, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - act=None, - bias_attr=False) - return fluid.layers.batch_norm( - input=conv, - act=act, ) - - -def shortcut(input, ch_out, stride, is_first): - ch_in = input.shape[1] - if ch_in != ch_out or stride != 1 or is_first == True: - return conv_bn_layer(input, ch_out, 1, stride) - else: - return input - - -def bottleneck_block(input, num_filters, stride): - conv0 = conv_bn_layer( - input=input, num_filters=num_filters, filter_size=1, act='relu') - conv1 = conv_bn_layer( - input=conv0, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') - conv2 = conv_bn_layer( - input=conv1, num_filters=num_filters * 4, filter_size=1, act=None) - - short = shortcut(input, num_filters * 4, stride, is_first=False) - - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def basic_block(input, num_filters, stride, is_first): - conv0 = conv_bn_layer( - input=input, - num_filters=num_filters, - filter_size=3, - act='relu', - stride=stride) - conv1 = conv_bn_layer( - input=conv0, num_filters=num_filters, filter_size=3, act=None) - short = shortcut(input, num_filters, stride, is_first) - return fluid.layers.elementwise_add(x=short, y=conv1, act='relu') - - -def network(input, layers=50, class_dim=1000): - supported_layers = [18, 34, 50, 101, 152] - assert layers in supported_layers - depth = None - if layers == 18: - depth = [2, 2, 2, 2] - elif layers == 34 or layers == 50: - depth = [3, 4, 6, 3] - elif layers == 101: - depth = [3, 4, 23, 3] - elif layers == 152: - depth = [3, 8, 36, 3] - num_filters = [64, 128, 256, 512] - with fluid.device_guard("gpu:0"): - conv = conv_bn_layer( - input=input, num_filters=64, filter_size=7, stride=2, act='relu') - conv = fluid.layers.pool2d( - input=conv, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - if layers >= 50: - for block in range(len(depth)): - with fluid.device_guard("gpu:1"): - for i in range(depth[block]): - conv = bottleneck_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1) - - with fluid.device_guard("gpu:2"): - pool = fluid.layers.pool2d( - input=conv, pool_size=7, pool_type='avg', global_pooling=True) - stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - out = fluid.layers.fc( - input=pool, - size=class_dim, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - else: - for block in range(len(depth)): - with fluid.device_guard("gpu:1"): - for i in range(depth[block]): - conv = basic_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block 
!= 0 else 1, - is_first=block == i == 0) - with fluid.device_guard("gpu:2"): - pool = fluid.layers.pool2d( - input=conv, pool_size=7, pool_type='avg', global_pooling=True) - stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - out = fluid.layers.fc( - input=pool, - size=class_dim, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - return out - - -def train(): - args = parse_args() - lr = args.lr - - with fluid.device_guard("gpu:0"): - image = fluid.layers.data( - name="image", shape=[3, 224, 224], dtype="float32") - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - data_loader = fluid.io.DataLoader.from_generator( - feed_list=[image, label], - capacity=64, - use_double_buffer=True, - iterable=False) - fc = build_network(image, layers=50) - - with fluid.device_guard("gpu:3"): - out, prob = fluid.layers.softmax_with_cross_entropy( - logits=fc, label=label, return_softmax=True) - loss = fluid.layers.mean(out) - acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5) - - optimizer = fluid.optimizer.SGD(lr) - optimizer = fluid.optimizer.PipelineOptimizer(optimizer, num_microbatches=2) - optimizer.minimize(loss) - - def train_reader(): - for _ in range(4000): - img = np.random.random(size=[3, 224, 224]).astype('float32') - label = np.random.random(size=[1]).astype('int64') - yield img, label - - data_loader.set_sample_generator(train_reader, batch_size=args.batch_size) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - exe.run(fluid.default_startup_program()) - - data_loader.start() - logger.info("begin training...") - exe.train_from_dataset(fluid.default_main_program(), debug=is_profile) - - -if __name__ == "__main__": - train() diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py deleted file mode 100644 index 6995346ffa61ea65119930296be2fba5a10c5451..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/pyreader.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy -import six - -import paddle -import paddle.dataset.mnist as mnist -import paddle.fluid as fluid - - -def network(is_train): - reader = fluid.layers.py_reader( - capacity=10, - shapes=((-1, 784), (-1, 1)), - dtypes=('float32', 'int64'), - name="train_reader" if is_train else "test_reader", - use_double_buffer=True) - img, label = fluid.layers.read_file(reader) - - hidden = img - - for i in six.moves.xrange(2): - hidden = fluid.layers.fc(input=hidden, size=100, act='tanh') - hidden = fluid.layers.dropout( - hidden, dropout_prob=0.5, is_test=not is_train) - - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - return fluid.layers.mean(loss), reader - - -def main(): - train_prog = fluid.Program() - startup_prog = fluid.Program() - - with fluid.program_guard(train_prog, startup_prog): - with fluid.unique_name.guard(): - loss, train_reader = network(True) - adam = fluid.optimizer.Adam(learning_rate=0.01) - adam.minimize(loss) - - test_prog = fluid.Program() - test_startup = fluid.Program() - with fluid.program_guard(test_prog, test_startup): - with fluid.unique_name.guard(): - test_loss, test_reader = network(False) - - use_cuda = fluid.core.is_compiled_with_cuda() - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - fluid.Executor(place).run(startup_prog) - fluid.Executor(place).run(test_startup) - - trainer = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=train_prog) - - tester = fluid.ParallelExecutor( - use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog) - - train_reader.decorate_paddle_reader( - paddle.reader.shuffle( - paddle.batch(mnist.train(), 512), buf_size=8192)) - - test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512)) - - for epoch_id in six.moves.xrange(10): - train_reader.start() - try: - while True: - print( - 'train_loss', - numpy.array(trainer.run(fetch_list=[loss.name]))) - except fluid.core.EOFException: - print('End of epoch', epoch_id) - train_reader.reset() - - test_reader.start() - try: - while True: - print( - 'test loss', - numpy.array(tester.run(fetch_list=[test_loss.name]))) - except fluid.core.EOFException: - print('End of testing') - test_reader.reset() - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6220bf62c79c30737f923e744d5670818f54ff6e..a25cba029dd8bac81d6b00c1d9fb710f421ce9d0 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -432,8 +432,6 @@ if(WITH_DISTRIBUTE) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_lars") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_train") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_save_load") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_text_classification") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_train") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_word2vec") @@ -587,8 +585,10 @@ endif() # setting timeout value for old unittests # set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) -set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) -set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) 
-set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) +if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) + set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) +endif() diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py index 863c001f226f86384e2820cb6877ded48cffa119..15e98481c26b20de4e9fa493fa022380ba1fcd63 100644 --- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py +++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py @@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7, return line -def prepare_fake_data(file_nums=8, file_lines=1000): +def prepare_fake_data(file_nums=9, file_lines=1000): """ Create fake data with same type as avazu_ctr_data """ diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 73b546b95cfeb8032c6e99eabe24c883d1f5f66c..dc39472d7aed8f52ee3bb0f85a5e503db9093070 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -196,8 +196,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): fleet.stop_worker() def do_dataset_training(self, fleet): - dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data( - ) + train_file_list = ctr_dataset_reader.prepare_fake_data() exe = fluid.Executor(fluid.CPUPlace()) @@ -206,9 +205,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): thread_num = 2 batch_size = 128 - filelist = [] - for _ in range(thread_num): - filelist.append(train_file_path) + filelist = train_file_list # config dataset dataset = paddle.distributed.fleet.DatasetFactory().create_dataset() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index 0de898d6dde217ec6d5cdf53611f986f7b04863f..7a4e7534f07391956cd94577847c8a8f77895818 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -177,7 +177,7 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): fleet.init_worker() exe.run(fluid.default_startup_program()) - thread_num = 1 + thread_num = int(os.getenv("CPU_NUM", 2)) batch_size = 128 filelist = fleet_util.get_file_shard(train_file_list) print("filelist: {}".format(filelist)) diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py similarity index 55% rename from python/paddle/fluid/tests/unittests/dist_simnet_bow.py rename to python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index 9fcba2aede1cea3c78108e7daa8eb34a1ab80048..7d5ca4fc6e3916eab29942c85e88664f60cbf032 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -19,6 +19,8 @@ import argparse import time import math import random +import shutil +import tempfile import paddle import paddle.fluid as fluid @@ -29,7 +31,8 @@ from multiprocessing import Process import os import signal from functools import reduce -from test_dist_base import TestDistRunnerBase, runtime_main +from test_dist_fleet_base import 
runtime_main, FleetDistRunnerBase +from paddle.distributed.fleet.base.util_factory import fleet_util DTYPE = "int64" DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000' @@ -49,6 +52,18 @@ fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 +def fake_simnet_reader(): + def reader(): + for _ in range(1000): + q = np.random.random_integers(0, 1500 - 1, size=1).tolist() + label = np.random.random_integers(0, 1, size=1).tolist() + pt = np.random.random_integers(0, 1500 - 1, size=1).tolist() + nt = np.random.random_integers(0, 1500 - 1, size=1).tolist() + yield [q, label, pt, nt] + + return reader + + def get_acc(cos_q_nt, cos_q_pt, batch_size): cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) cond = fluid.layers.cast(cond, dtype='float64') @@ -75,34 +90,40 @@ def get_loss(cos_q_pt, cos_q_nt): return avg_cost -def get_optimizer(op="sgd"): - if op.upper() == "sgd".upper(): - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) - elif op.upper() == "adam".upper(): - optimizer = fluid.optimizer.Adam(learning_rate=base_lr) - else: - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) - return optimizer - - def train_network(batch_size, is_distributed=False, is_sparse=False, - is_self_contained_lr=False): + is_self_contained_lr=False, + is_pyreader=False): # query q = fluid.layers.data( name="query_ids", shape=[1], dtype="int64", lod_level=1) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data( + name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + # nt + nt = fluid.layers.data( + name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + + datas = [q, label, pt, nt] + + reader = None + if is_pyreader: + reader = fluid.io.PyReader( + feed_list=datas, + capacity=64, + iterable=False, + use_double_buffer=False) + # embedding q_emb = fluid.embedding( input=q, is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__", - learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__"), + initializer=fluid.initializer.Constant(value=0.01), name="__emb__"), is_sparse=is_sparse) q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum @@ -115,12 +136,8 @@ def train_network(batch_size, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__q_fc__", - learning_rate=base_lr)) - # label data - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - # pt - pt = fluid.layers.data( - name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + learning_rate=base_lr), ) + # embedding pt_emb = fluid.embedding( input=pt, @@ -129,9 +146,7 @@ def train_network(batch_size, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__"), + learning_rate=emb_lr), is_sparse=is_sparse) pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum @@ -142,24 +157,16 @@ def train_network(batch_size, input=pt_ss, size=hid_dim, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__fc__", - learning_rate=base_lr), + initializer=fluid.initializer.Constant(value=0.01), name="__fc__"), bias_attr=fluid.ParamAttr(name="__fc_b__")) - # nt - nt = fluid.layers.data( - 
name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + # embedding nt_emb = fluid.embedding( input=nt, is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__", - learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__"), + initializer=fluid.initializer.Constant(value=0.01), name="__emb__"), is_sparse=is_sparse) nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum @@ -170,9 +177,7 @@ def train_network(batch_size, input=nt_ss, size=hid_dim, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__fc__", - learning_rate=base_lr), + initializer=fluid.initializer.Constant(value=0.01), name="__fc__"), bias_attr=fluid.ParamAttr(name="__fc_b__")) cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) @@ -180,79 +185,67 @@ def train_network(batch_size, avg_cost = get_loss(cos_q_pt, cos_q_nt) # acc acc = get_acc(cos_q_nt, cos_q_pt, batch_size) - return [avg_cost, acc, cos_q_pt] - - -def combination(x, y): - res = [[[xi, yi] for yi in y] for xi in x] - return res[0] - - -def get_one_data(file_list): - for file in file_list: - contents = [] - with open(file, "r") as fin: - for i in fin: - contents.append(i.strip()) - for index, q in enumerate(contents): - try: - one_data = [[int(j) for j in i.split(" ")] - for i in q.split(";")[:-1]] - if one_data[1][0] + one_data[1][1] != len(one_data) - 3: - q = fin.readline() - continue - tmp = combination(one_data[3:3 + one_data[1][0]], - one_data[3 + one_data[1][0]:]) - except Exception as e: - continue - - for each in tmp: - yield [one_data[2], 0, each[0], each[1]] - - -def get_batch_reader(file_list, batch_size): - def batch_reader(): - res = [] - for i in get_one_data(file_list): - if random.random() <= sample_rate: - res.append(i) - if len(res) >= batch_size: - yield res - res = [] - - return batch_reader - - -def get_train_reader(batch_size): - # The training data set. 
- train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet", - "train") - train_reader = get_batch_reader([train_file], batch_size) - train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"] - return train_reader, train_feed - - -class TestDistSimnetBow2x2(TestDistRunnerBase): - def get_model(self, batch_size=2): - # Train program - avg_cost, acc, predict = \ - train_network(batch_size, - bool(int(os.environ["IS_DISTRIBUTED"])), - bool(int(os.environ["IS_SPARSE"])), - bool(int(os.environ["IS_SELF_CONTAINED_LR"]))) - - inference_program = fluid.default_main_program().clone() - - # Optimization - opt = os.getenv('OPTIMIZER', 'sgd') - opt = get_optimizer(opt) - opt.minimize(avg_cost) - - # Reader - train_reader, _ = get_train_reader(batch_size) - return inference_program, avg_cost, train_reader, train_reader, acc, predict + return avg_cost, acc, cos_q_pt, reader + + +class TestDistSimnetBow2x2(FleetDistRunnerBase): + """ + For test SimnetBow model, use Fleet api + """ + + def net(self, args, batch_size=4, lr=0.01): + avg_cost, _, predict, self.reader = \ + train_network(batch_size=batch_size, is_distributed=False, + is_sparse=True, is_self_contained_lr=False, is_pyreader=(args.reader == "pyreader")) + self.avg_cost = avg_cost + self.predict = predict + + return avg_cost + + def check_model_right(self, dirname): + model_filename = os.path.join(dirname, "__model__") + + with open(model_filename, "rb") as f: + program_desc_str = f.read() + + program = fluid.Program.parse_from_string(program_desc_str) + with open(os.path.join(dirname, "__model__.proto"), "w") as wn: + wn.write(str(program)) + + def do_pyreader_training(self, fleet): + """ + do training using dataset, using fetch handler to catch variable + Args: + fleet(Fleet api): the fleet object of Parameter Server, define distribute training role + """ + + exe = fluid.Executor(fluid.CPUPlace()) + fleet.init_worker() + exe.run(fluid.default_startup_program()) + batch_size = 4 + # reader + train_reader = paddle.batch(fake_simnet_reader(), batch_size=batch_size) + self.reader.decorate_sample_list_generator(train_reader) + for epoch_id in range(1): + self.reader.start() + try: + pass_start = time.time() + while True: + loss_val = exe.run(program=fluid.default_main_program(), + fetch_list=[self.avg_cost.name]) + loss_val = np.mean(loss_val) + message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id, + loss_val) + fleet_util.print_on_rank(message, 0) + + pass_time = time.time() - pass_start + except fluid.core.EOFException: + self.reader.reset() + fleet.stop_worker() + + def do_dataset_training(self, fleet): + pass if __name__ == "__main__": - paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train") runtime_main(TestDistSimnetBow2x2) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py index 75bff108dd43665df0fc1c8b166a935946b4fbc7..ba0adaf32e15db71162aed71c042100a0cd50e26 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py @@ -81,7 +81,7 @@ class PredictorTools(object): tensor.set_lod(feed_data.lod()) # ensure no diff in multiple repeat times - repeat_time = 10 + repeat_time = 2 for i in range(repeat_time): predictor.zero_copy_run() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index 3e6fe168b8eaf39286c518c8b4a2ad6d48b0e6bb..29b4f1b05f9c2911b849b323674b3a704a1da297 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -19,9 +19,11 @@ import unittest import inspect import gast +import paddle import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph +from paddle import to_tensor from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.jit import dygraph_to_static_func from paddle.fluid.dygraph.dygraph_to_static.utils import is_dygraph_api @@ -45,11 +47,19 @@ def dyfunc_to_variable_3(x): return res +def dyfunc_to_tensor(x): + res1 = paddle.to_tensor(x, dtype=None, place=None, stop_gradient=True) + res2 = paddle.tensor.to_tensor(data=res1) + res3 = to_tensor(data=res2) + return res3 + + class TestDygraphBasicApi_ToVariable(unittest.TestCase): def setUp(self): self.input = np.ones(5).astype("int32") self.test_funcs = [ - dyfunc_to_variable, dyfunc_to_variable_2, dyfunc_to_variable_3 + dyfunc_to_tensor, dyfunc_to_variable, dyfunc_to_variable_2, + dyfunc_to_variable_3 ] self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py index 8e35dd78457bb59bb4882bc1deeb23539f47012a..b72149a29c73ff9e1fa1975c3caffebb6202e0b7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py @@ -123,7 +123,7 @@ class TestConvertWithCache(unittest.TestCase): @declarative -def sum_even_util_limit(max_len, limit): +def sum_even_until_limit(max_len, limit): ret_sum = fluid.dygraph.to_variable(np.zeros((1)).astype('int32')) for i in range(max_len): if i % 2 > 0: @@ -147,7 +147,7 @@ def sum_under_while(limit): class TestToOutputWithCache(unittest.TestCase): def test_output(self): with fluid.dygraph.guard(): - ret = sum_even_util_limit(80, 10) + ret = sum_even_until_limit(80, 10) self.assertEqual(ret.numpy(), 30) ret = declarative(sum_under_while)(100) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index e5a33e59a3b97cc06c49247f8b7ab97f92240d26..949286f63efb3357325f25b02f60e938eebd28e8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -13,13 +13,15 @@ # limitations under the License. 
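# --- annotation (hedged sketch, not part of the patch): the three spellings exercised by
# dyfunc_to_tensor above resolve to the same API and can be chained eagerly:
import numpy as np
import paddle
from paddle import to_tensor

paddle.disable_static()
a = paddle.to_tensor(np.ones(5).astype("int32"), dtype=None, place=None, stop_gradient=True)
b = paddle.tensor.to_tensor(data=a)
c = to_tensor(data=b)
print(c.numpy())  # [1 1 1 1 1]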
import numpy as np +import unittest + import paddle -from paddle.static import InputSpec import paddle.fluid as fluid +from paddle.static import InputSpec from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram -import unittest +from test_basic_api_transformation import dyfunc_to_variable program_trans = ProgramTranslator() @@ -181,6 +183,9 @@ def foo_func(a, b, c=1, d=2): class TestDifferentInputSpecCacheProgram(unittest.TestCase): + def setUp(self): + program_trans.enable(True) + def test_with_different_input(self): with fluid.dygraph.guard(fluid.CPUPlace()): x_data = np.ones([16, 10]).astype('float32') @@ -272,5 +277,23 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): foo_3.concrete_program +class TestDeclarativeAPI(unittest.TestCase): + def test_error(self): + func = declarative(dyfunc_to_variable) + + paddle.enable_static() + + # Failed to run the callable object decorated by '@paddle.jit.to_static' + # if it does NOT in dynamic mode. + with self.assertRaises(RuntimeError): + func(np.ones(5).astype("int32")) + + program_trans.enable(False) + with self.assertRaises(AssertionError): + # AssertionError: We Only support to_variable in imperative mode, + # please use fluid.dygraph.guard() as context to run it in imperative Mode + func(np.ones(5).astype("int32")) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py index 214cd95d3bc620b3bcadb88e57c7e54a593eaaf4..510b615654751500c33dc3311353ba7e2f8baf40 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
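A hedged standalone sketch of the failure mode TestDeclarativeAPI above asserts: calling a function decorated with @paddle.jit.to_static while static mode is enabled is expected to raise RuntimeError. The double() function below is a hypothetical stand-in for the test's dyfunc_to_variable:

import numpy as np
import paddle

@paddle.jit.to_static
def double(x):                 # hypothetical example function
    x = paddle.to_tensor(x)
    return x * 2

paddle.disable_static()
print(double(np.ones(3).astype("int32")).numpy())   # runs under dygraph

paddle.enable_static()
try:
    double(np.ones(3).astype("int32"))   # the patch's test expects RuntimeError here
except RuntimeError as e:
    print("raised as expected:", e)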
@@ -86,11 +86,11 @@ class TestLoggingUtils(unittest.TestCase): with mock.patch.object(sys, 'stdout', stream): logging_utils.warn(warn_msg) logging_utils.error(error_msg) - self.translator_logger.verbosity_level = 2 + self.translator_logger.verbosity_level = 1 logging_utils.log(1, log_msg_1) logging_utils.log(2, log_msg_2) - result_msg = '\n'.join([warn_msg, error_msg, log_msg_2, ""]) + result_msg = '\n'.join([warn_msg, error_msg, log_msg_1, ""]) self.assertEqual(result_msg, stream.getvalue()) def test_log_transformed_code(self): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 1ef3bd1bf150056816283c83fa3ff6af1e589732..bd600d2f2dbd6341ff7a83d6636047d01cae7859 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -154,6 +154,18 @@ class TestMNISTWithToStatic(TestMNIST): msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, static_loss)) + def test_mnist_declarative_cpu_vs_mkldnn(self): + dygraph_loss_cpu = self.train_dygraph() + fluid.set_flags({'FLAGS_use_mkldnn': True}) + try: + dygraph_loss_mkldnn = self.train_dygraph() + finally: + fluid.set_flags({'FLAGS_use_mkldnn': False}) + self.assertTrue( + np.allclose(dygraph_loss_cpu, dygraph_loss_mkldnn), + msg='cpu dygraph is {}\n mkldnn dygraph is \n{}'.format( + dygraph_loss_cpu, dygraph_loss_mkldnn)) + def train(self, to_static=False): prog_trans = ProgramTranslator() prog_trans.enable(to_static) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 6556b2f03bd5304e290792d07d1d969ab255bfdc..203c8ddb3488c0fef9a0a590378505e5b61233cf 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -346,6 +346,13 @@ class TestResnet(unittest.TestCase): dygraph_loss)) self.verify_predict() + def test_in_static_mode_mkldnn(self): + fluid.set_flags({'FLAGS_use_mkldnn': True}) + try: + train(to_static=True) + finally: + fluid.set_flags({'FLAGS_use_mkldnn': False}) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py index 13041827ffeabd3d6b79e4f34a67bd09624e54f6..046268444018799ca4d7f5530cbb6b1c707e062f 100644 --- a/python/paddle/fluid/tests/unittests/launch_function_helper.py +++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py @@ -15,7 +15,8 @@ from multiprocessing import Pool, Process import os import socket from contextlib import closing -import psutil +import time +import sys def launch_func(func, env_dict): @@ -25,19 +26,36 @@ def launch_func(func, env_dict): return proc -def wait(procs, timeout=None): - # wait - decents = [] +def wait(procs, timeout=30): + error = False + begin = time.time() + while True: + alive = False + for p in procs: + p.join(timeout=10) + if p.exitcode is None: + alive = True + continue + elif p.exitcode != 0: + error = True + break + + if not alive: + break + + if error: + break + + if timeout is not None and time.time() - begin >= timeout: + error = True + break + for p in procs: - for child in psutil.Process(p.pid).children(recursive=True): - decents.append(child) - - gone, alive = psutil.wait_procs(decents, timeout=timeout) - for p in alive: - p.kill() - for p 
in gone: - if p.returncode != 0: - sys.exit(1) + if p.is_alive(): + p.terminate() + + if error: + sys.exit(1) def _find_free_port(port_set): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5715a0d0afcf59ebbe1cc95a6b06dead64c6e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import unicode_literals +from __future__ import print_function + +import numpy as np +import paddle.fluid as fluid +import os +from paddle.fluid.layer_helper import LayerHelper + + +def check(): + print("check: fluid.core.globals()['FLAGS_use_mkldnn']=", + fluid.core.globals()["FLAGS_use_mkldnn"]) + print("check: fluid.get_flags('FLAGS_use_mkldnn')=", + fluid.get_flags(['FLAGS_use_mkldnn'])) + print("check: DNNL_VERBOSE=", os.environ['DNNL_VERBOSE']) + a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) + helper = LayerHelper(fluid.unique_name.generate(str("test")), act="relu") + func = helper.append_activation + with fluid.dygraph.guard(fluid.core.CPUPlace()): + a = fluid.dygraph.to_variable(a_np) + res1 = func(a) + res2 = np.maximum(a_np, 0) + assert (np.array_equal(res1.numpy(), res2)) + + +if __name__ == '__main__': + try: + check() + for k, v in sorted(os.environ.items()): + print(k + ':', v) + print('\n') + except Exception as e: + print(e) + print(type(e)) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py new file mode 100644 index 0000000000000000000000000000000000000000..69676d0d70bdd523652c30c4cf066dc6982c46d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
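The new check script above reads FLAGS_use_mkldnn through two equivalent routes; a minimal sketch of that flag round-trip, assuming a build with oneDNN support:

import paddle.fluid as fluid

# Flip the global flag on, read it back both ways, then restore it.
fluid.set_flags({'FLAGS_use_mkldnn': True})
print(fluid.core.globals()["FLAGS_use_mkldnn"])   # direct global lookup
print(fluid.get_flags(['FLAGS_use_mkldnn']))      # e.g. {'FLAGS_use_mkldnn': True}
fluid.set_flags({'FLAGS_use_mkldnn': False})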
+ +from __future__ import unicode_literals +from __future__ import print_function + +import unittest +import os +import sys +import subprocess + + +class TestFlagsUseMkldnn(unittest.TestCase): + def setUp(self): + self._python_interp = sys.executable + self._python_interp += " check_flags_use_mkldnn.py" + + self.env = os.environ.copy() + self.env[str("GLOG_v")] = str("3") + self.env[str("DNNL_VERBOSE")] = str("1") + self.env[str("FLAGS_use_mkldnn")] = str("1") + + def test_flags_use_mkl_dnn(self): + cmd = self._python_interp + + proc = subprocess.Popen( + cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self.env) + + out, err = proc.communicate() + returncode = proc.returncode + + print('out', out) + print('err', err) + + assert returncode == 0 + # in python3, type(out) is 'bytes', need use encode + assert out.find( + "dnnl_verbose,exec,cpu,eltwise,jit:avx512_common,forward_training," + "data_f32::blocked:abc:f0 diff_undef::undef::f0,,alg:eltwise_relu". + encode()) != -1 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index aff13f0b555299d1c7b453b61be79f5a356a5416..b083e76897cd96cea93d7b90898541de1226ac15 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -114,8 +114,8 @@ class TestMnist(TestParallelDyGraphRunnerBase): model = MNIST() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=2, drop_last=True) - opt = fluid.optimizer.Adam( - learning_rate=1e-3, parameter_list=model.parameters()) + opt = paddle.optimizer.Adam( + learning_rate=1e-3, parameters=model.parameters()) return model, train_reader, opt def run_one_loop(self, model, opt, data): diff --git a/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py b/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..41eadc13a2ad26ac15b0623147dae5771f371a12 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
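parallel_dygraph_mnist.py above moves from fluid.optimizer.Adam(parameter_list=...) to the 2.0-style optimizer; a short sketch of the new calling convention, using a plain Linear layer as a placeholder model:

import paddle

paddle.disable_static()
model = paddle.nn.Linear(10, 10)   # placeholder model, not the MNIST net
opt = paddle.optimizer.Adam(learning_rate=1e-3,
                            parameters=model.parameters())

loss = paddle.mean(model(paddle.randn([4, 10])))
loss.backward()
opt.step()        # apply the update
opt.clear_grad()  # reset gradients before the next batch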
+ +from __future__ import print_function + +import os +import logging +import tarfile + +import random + +import paddle +import paddle.fluid.incubate.data_generator as data_generator + +logging.basicConfig() +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +class DatasetSimnetReader(data_generator.MultiSlotDataGenerator): + def generate_sample(self, line): + pass diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index d4aafcd27a5aceb3c0b5fa9ddf8343d404bddbf5..14e83fccd655527d8f3012365e4757d23236a445 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -450,7 +450,7 @@ class TestAdamOpV2(unittest.TestCase): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() @@ -504,6 +504,19 @@ class TestAdamOpV2(unittest.TestCase): shape=[1], value=lr, dtype='float32') adam.set_lr(lr_var) + def test_adam_op_invalid_input(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adam( + 0.1, beta1=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adam( + 0.1, beta2=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adam( + 0.1, epsilon=-1, parameters=linear.parameters()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py index a6d1be7616c73019cd8f66dcf0c108cd58ec600b..8ce7656acfae77987b284e29cd85b35d264b20e2 100644 --- a/python/paddle/fluid/tests/unittests/test_adamax_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py @@ -184,5 +184,21 @@ def adamax_step(inputs, attributes): return param_out, moment_out, inf_norm_out +class TestAdamaxOpV2(unittest.TestCase): + def test_adamax_op_invalid_input(self): + import paddle + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adamax( + 0.1, beta1=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adamax( + 0.1, beta2=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adamax( + 0.1, epsilon=-1, parameters=linear.parameters()) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 0a7cf54e2e0f15e51ba1b6f7526837f53c7cc2e0..cce24b57d2ca50e96e3ae0cf6d8912a8aea79a31 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -76,6 +76,19 @@ class TestAdamWOp(unittest.TestCase): rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) assert rets[0] is not None + def test_adamw_op_invalid_input(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.AdamW( + 0.1, beta1=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.AdamW( + 0.1, beta2=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.AdamW( + 0.1, epsilon=-1, 
parameters=linear.parameters()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py index 55612d71a17a7ae9801535bf5a35c83b100aab30..d3e990ca13eb2911ea04ed546b91f58e2db4e440 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py @@ -49,7 +49,6 @@ class TestAffineGridOp(OpTest): self.initTestCase() self.op_type = "affine_grid" theta = np.random.randint(1, 3, self.theta_shape).astype("float32") - theta = np.ones(self.theta_shape).astype("float32") self.inputs = {'Theta': theta} self.attrs = { "use_cudnn": self.use_cudnn, diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py index 7c1f9d802c31ac2c3b244541936ba25018e1487a..1b1b1d7c983282974d2fa46038c35c98de4f9ec2 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py @@ -218,7 +218,7 @@ def create_test_case(op_type): self.assertTrue("test_arg_api" in result.name) def run_dygraph(self, place): - paddle.disable_static() + paddle.disable_static(place) op = eval("paddle.%s" % (op_type)) data_tensor = paddle.to_tensor(self.input_data) @@ -240,7 +240,7 @@ def create_test_case(op_type): #case 4 result_data = op(data_tensor, axis=-1, keepdim=True) excepted_data = self.numpy_op(self.input_data, axis=-1) - excepted_data = excepted_data.reshape((10)) + excepted_data = excepted_data.reshape((10, 1)) self.assertTrue((result_data.numpy() == excepted_data).all(), True) #case 5 @@ -299,14 +299,42 @@ class TestArgMinMaxOpError(unittest.TestCase): name="test_argmax", shape=[10], dtype="float32") output = paddle.argmax(x=data, dtype="float32") - self.assertRaises(ValueError, test_argmax_attr_type) + self.assertRaises(TypeError, test_argmax_attr_type) def test_argmin_attr_type(): data = paddle.static.data( name="test_argmax", shape=[10], dtype="float32") output = paddle.argmin(x=data, dtype="float32") - self.assertRaises(ValueError, test_argmin_attr_type) + self.assertRaises(TypeError, test_argmin_attr_type) + + def test_argmax_axis_type(): + data = paddle.static.data( + name="test_argmax", shape=[10], dtype="float32") + output = paddle.argmax(x=data, axis=1.2) + + self.assertRaises(TypeError, test_argmax_axis_type) + + def test_argmin_axis_type(): + data = paddle.static.data( + name="test_argmin", shape=[10], dtype="float32") + output = paddle.argmin(x=data, axis=1.2) + + self.assertRaises(TypeError, test_argmin_axis_type) + + def test_argmax_dtype_type(): + data = paddle.static.data( + name="test_argmax", shape=[10], dtype="float32") + output = paddle.argmax(x=data, dtype=1) + + self.assertRaises(TypeError, test_argmax_dtype_type) + + def test_argmin_dtype_type(): + data = paddle.static.data( + name="test_argmin", shape=[10], dtype="float32") + output = paddle.argmin(x=data, dtype=1) + + self.assertRaises(TypeError, test_argmin_dtype_type) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 5c705378e515eec4c950f6996e2789df603fcda3..2af0b31d6fc26c59803f29dcdc54979491767dd2 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -43,6 +43,21 @@ class TestBatchNorm(unittest.TestCase): 
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + def error1d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1d(1, data_format='NCDHW') + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d_dataformat(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2d(1, data_format='NCDHW') + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3d(1, data_format='NCL') + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + def error1d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') batch_norm1d = paddle.nn.BatchNorm1d(1) @@ -62,6 +77,9 @@ class TestBatchNorm(unittest.TestCase): self.assertRaises(ValueError, error1d) self.assertRaises(ValueError, error2d) self.assertRaises(ValueError, error3d) + self.assertRaises(ValueError, error1d_dataformat) + self.assertRaises(ValueError, error2d_dataformat) + self.assertRaises(ValueError, error3d_dataformat) def test_dygraph(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index 2e1f9d41747e3a99b4b4a0650a52973459b85c7b..b56d9f6668e8bcbd37443fb88b1f5f4dd40a2511 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -138,8 +138,9 @@ class TestClipAPI(unittest.TestCase): out_6 = paddle.clip(images, max=max) out_7 = paddle.clip(images, max=-1.) out_8 = paddle.clip(images) + out_9 = paddle.clip(paddle.cast(images, 'float64'), min=0.2, max=0.9) - res1, res2, res3, res4, res5, res6, res7, res8 = exe.run( + res1, res2, res3, res4, res5, res6, res7, res8, res9 = exe.run( fluid.default_main_program(), feed={ "image": data, @@ -147,7 +148,7 @@ class TestClipAPI(unittest.TestCase): "max": np.array([0.8]).astype('float32') }, fetch_list=[ - out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8 + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8, out_9 ]) self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) @@ -158,6 +159,8 @@ class TestClipAPI(unittest.TestCase): self.assertTrue(np.allclose(res6, data.clip(max=0.8))) self.assertTrue(np.allclose(res7, data.clip(max=-1))) self.assertTrue(np.allclose(res8, data)) + self.assertTrue( + np.allclose(res9, data.astype(np.float64).clip(0.2, 0.9))) def test_clip_dygraph(self): place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py index da527b26bf0608da5a648d92b492ff27cf2802f0..35fce9e9d6ba9d7a2f264bdd5c1f3deb7a2a67e9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -44,7 +44,7 @@ class Conv1dTestCase(unittest.TestCase): self.spartial_shape = spartial_shape self.filter_size = filter_size self.data_format = data_format - self.channel_last = (self.data_format == "NHWC") + self.channel_last = (self.data_format == "NLC") self.padding = padding self.padding_mode = padding_mode @@ -147,6 +147,14 @@ class Conv1dErrorTestCase(Conv1dTestCase): self.paddle_nn_layer() +class Conv1dTypeErrorTestCase(Conv1dTestCase): + def runTest(self): + place = fluid.CPUPlace() + with dg.guard(place): + 
with self.assertRaises(TypeError): + self.paddle_nn_layer() + + def add_cases(suite): suite.addTest(Conv1dTestCase(methodName='runTest')) suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2)) @@ -161,6 +169,7 @@ def add_cases(suite): Conv1dTestCase( methodName='runTest', padding=2, data_format='NLC')) suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1])) + suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1, 2])) suite.addTest(Conv1dTestCase(methodName='runTest', padding=2)) suite.addTest(Conv1dTestCase(methodName='runTest')) suite.addTest( @@ -178,7 +187,7 @@ def add_cases(suite): def add_error_cases(suite): suite.addTest( - Conv1dErrorTestCase( + Conv1dTypeErrorTestCase( methodName='runTest', padding_mode="reflect", padding="valid")) suite.addTest( Conv1dErrorTestCase( diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py index 73227dd3610376d85fcfc70bb2653dfd927427fd..4c98aacd209dab8e5dc9e7744922a927700c4bb3 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py @@ -201,6 +201,7 @@ def add_cases(suite): ConvTranspose1dTestCase( methodName='runTest', data_format="NLC", stride=3, output_padding=2)) + suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[1, 2])) def add_error_cases(suite): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py new file mode 100644 index 0000000000000000000000000000000000000000..0c2520038a82a0b9427b2cbe1d4010a1bc8e040c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -0,0 +1,163 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test cloud role maker.""" + +from __future__ import print_function +import os +import unittest +import paddle.fluid.generator as generator + +import time # temp for debug +import paddle.fluid as fluid +import numpy as np +import paddle +import paddle.fluid.core as core + + +class TestGeneratorSeed(unittest.TestCase): + """ + Test cases for cpu generator seed. 
+ """ + + def test_gen_dropout_dygraph(self): + gen = paddle.manual_seed(12343) + + fluid.enable_dygraph() + + gen.manual_seed(111111111) + st = paddle.get_cuda_rng_state() + + x = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x_again = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x_third = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + print("x: {}".format(x.numpy())) + print("x_again: {}".format(x_again.numpy())) + x = x + x_again + x_third + y = fluid.layers.dropout(x, 0.5) + + paddle.set_cuda_rng_state(st) + + x1 = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1_again = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1_third = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1 = x1 + x1_again + x1_third + y1 = fluid.layers.dropout(x1, 0.5) + y_np = y.numpy() + y1_np = y1.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> dropout dygraph >>>>>>>") + self.assertTrue(np.allclose(y_np, y1_np)) + + def test_generator_gaussian_random_dygraph(self): + """Test Generator seed.""" + fluid.enable_dygraph() + + paddle.manual_seed(12312321111) + x = fluid.layers.gaussian_random([120], dtype="float32") + st1 = paddle.get_cuda_rng_state() + x1 = fluid.layers.gaussian_random([120], dtype="float32") + paddle.set_cuda_rng_state(st1) + x2 = fluid.layers.gaussian_random([120], dtype="float32") + paddle.manual_seed(12312321111) + x3 = fluid.layers.gaussian_random([120], dtype="float32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> gaussian random dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_randint_dygraph(self): + """Test Generator seed.""" + + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = paddle.randint(low=10, shape=[10], dtype="int32") + st1 = gen.get_state() + x1 = paddle.randint(low=10, shape=[10], dtype="int32") + gen.set_state(st1) + x2 = paddle.randint(low=10, shape=[10], dtype="int32") + paddle.manual_seed(12312321111) + x3 = paddle.randint(low=10, shape=[10], dtype="int32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> randint dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_gen_TruncatedNormal_initializer(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + cur_state = paddle.get_cuda_rng_state() + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. 
+ x = fluid.layers.uniform_random(shape=[2, 10]) + result_1 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + result_2 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + paddle.manual_seed(123123143) + with fluid.program_guard(train_program, startup_program): + exe.run(startup_program) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if core.is_compiled_with_cuda(): + print(">>>>>>> truncated normal static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py new file mode 100755 index 0000000000000000000000000000000000000000..2a80e20d692c88497e7edccd6eca5509e3522871 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -0,0 +1,152 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.static import Program, program_guard +import unittest +import paddle.fluid.core as core +import sys + +LOWEST_WARNING_POSTION = 3 +ERROR_WARNING_POSTION = sys.maxsize + +# custom paddle version +paddle.version.major = '1' +paddle.version.minor = '8' +paddle.version.patch = '0' +paddle.version.rc = '0' +paddle.__version__ = '1.8.0' +paddle.version.full_version = '1.8.0' +print("current paddle version: ", paddle.__version__) + +paddle.disable_static() + + +def get_warning_index(api): + """ + Given an paddle API, return the index of the Warinng information in its doc string if exists; + If Warinng information doesn't exist, return the default ERROR_WARNING_POSTION, sys.maxsize. + + Args: + API (python object) + + Returns: + index (int): the index of the Warinng information in its doc string if exists. + """ + + doc_lst = api.__doc__.splitlines() + for idx, val in enumerate(doc_lst): + if val.startswith("Warning: ") and val.endswith( + " instead." + ) and "and will be removed in future versions." in val: + return idx + return ERROR_WARNING_POSTION + + +class TestDeprecatedDocorator(unittest.TestCase): + """ + tests for paddle's Deprecated Docorator. + test_fluid_data: test for old fluid.data API. + test_fluid_elementwise_mul: test for old fluid.layers.elementwise_xxx APIs. 
+ test_new_multiply: test for new api, which should not insert warning information. + test_ops_elementwise_mul: test for C++ elementwise_mul op, which should not insert warning information. + """ + + def test_fluid_data(self): + """ + test old fluid elementwise_mul api, it should fire Warinng function, + which insert the Warinng info on top of API's doc string. + """ + # Initialization + x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32') + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(fluid.data) + + # testting + self.assertGreater(expected, captured) + + def test_fluid_elementwise_mul(self): + """ + test old fluid elementwise_mul api, it should trigger Warinng function, + which insert the Warinng info on top of API's doc string. + """ + + # Initialization + a = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + x = paddle.to_tensor(a) + y = paddle.to_tensor(b) + res = fluid.layers.elementwise_mul(x, y) + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(fluid.layers.elementwise_mul) + + # testting + self.assertGreater(expected, captured) + + def test_new_multiply(self): + """ + Test for new multiply api, expected result should be False. + """ + + a = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + x = paddle.to_tensor(a) + y = paddle.to_tensor(b) + res = paddle.multiply(x, y) + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(paddle.multiply) + + # testting + self.assertLess(expected, captured) + + def test_ops_elementwise_mul(self): + """ + Test for new C++ elementwise_op, expected result should be True, + because not matter what fluid.layers.elementwise_mul is deprecated. 
+ """ + + a = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + x = paddle.to_tensor(a) + y = paddle.to_tensor(b) + res = core.ops.elementwise_mul(x, y) + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(fluid.layers.elementwise_mul) + + # testting + self.assertGreater(expected, captured) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 2919ec5e9ca97b1d59af46a54b2d702cb6de4a14..529fff158c55fc30248b9f5a88c8c615a8b55c79 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -43,7 +43,7 @@ class TestDirectory(unittest.TestCase): 'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', - 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', + 'paddle.jit.save', 'paddle.jit.load', 'paddle.SaveLoadConfig', 'paddle.NoamDecay', 'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', 'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', 'paddle.PolynomialDecay', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index faff81fa84fb5fa66c9ff14f782d2301e3964672..f4d368b6b6f52f3071320eaffbeedc8d14d63d2e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -488,6 +488,50 @@ class TestParallelDyGraphRunnerBase(object): model.clear_gradients() return out_losses + def run_gpu_fleet_api_trainer(self, args): + import paddle.distributed.fleet as fleet + import paddle.distributed.fleet.base.role_maker as role_maker + # 1. enable dygraph + paddle.disable_static() + + # 2. init seed + seed = 90 + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed = seed + # get trainer id + args.trainer_id = paddle.distributed.get_rank() + + # 3. init parallel env + if args.update_method == "nccl2": + fleet.init(is_collective=True) + + # 4. 
train model + model, train_reader, opt = self.get_model() + if args.update_method == "nccl2": + opt = fleet.distributed_optimizer(opt) + model = fleet.distributed_model(model) + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + out_losses.append(loss.numpy()) + + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.step() + opt.clear_grad() + print_to_out(out_losses) + def runtime_main(test_class): parser = argparse.ArgumentParser(description='Run dist test.') @@ -687,7 +731,8 @@ class TestDistBase(unittest.TestCase): envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '') cmd += " -m coverage run --branch -p" - cmd += " %s --role trainer --lr %f" % (model, self._lr) + cmd += " %s --role trainer --update_method local --lr %f" % (model, + self._lr) if batch_size != DEFAULT_BATCH_SIZE: cmd += " --batch_size %d" % batch_size @@ -850,6 +895,7 @@ class TestDistBase(unittest.TestCase): if self.__use_cuda: tr_cmd += " --use_cuda" env.update({ + "FLAGS_selected_gpus": "{}".format(0), "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id % 2), "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), "PADDLE_TRAINER_ID": "{}".format(trainer_id), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index b506f179143412e2bdb5d9eda511d90a0a3eea6d..e2336caac1c07f555280b82ba8fcfa7e5ec7f5b8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -156,40 +156,5 @@ class TestDistCtrHalfAsync2x2(TestFleetBase): "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) -class TestDistCtrPsGpuPyreaderAsync2x2(TestFleetBase): - def _setup_config(self): - self._mode = "async" - self._reader = "pyreader" - - def check_with_place(self, - model_file, - delta=1e-3, - check_error_log=False, - need_envs={}): - required_envs = { - "PATH": os.getenv("PATH", ""), - "PYTHONPATH": os.getenv("PYTHONPATH", ""), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), - "FLAGS_rpc_deadline": "30000", # 5sec to fail fast - "http_proxy": "", - "FLAGS_communicator_send_queue_size": "2", - "FLAGS_communicator_max_merge_var_num": "2", - "CPU_NUM": "2", - "SAVE_MODEL": "1" - } - - required_envs.update(need_envs) - - if check_error_log: - required_envs["GLOG_v"] = "3" - required_envs["GLOG_logtostderr"] = "1" - - tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) - - def test_dist_train(self): - self.check_with_place( - "dist_fleet_ctr_ps_gpu.py", delta=1e-5, check_error_log=True) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py index 0fe7c386c1eeb751f34cf681778132310c304d51..7d18e935f58b6588adbef913c10d3ad497f07b53 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py @@ -21,7 +21,7 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory from test_dist_fleet_base import TestFleetBase -from dist_simnet_bow import 
train_network +from dist_fleet_simnet_bow import train_network class TestDistGeoCtr_2x2(TestFleetBase): @@ -72,7 +72,7 @@ class TestGeoSgdTranspiler(unittest.TestCase): strategy = StrategyFactory.create_geo_strategy(5) - avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse) + avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse) optimizer = fluid.optimizer.SGD(0.1) optimizer = fleet.distributed_optimizer(optimizer, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py index 46616f3dde486e61488d6852ca9efc37a066ab0b..3c68af474cf7cae96a9fa62688460f84123438f5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py @@ -21,7 +21,7 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig from test_dist_fleet_base import TestFleetBase -from dist_simnet_bow import train_network +from dist_fleet_simnet_bow import train_network @unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged") @@ -44,7 +44,7 @@ class TestDistGeoClipByGlobalNormTranspiler(unittest.TestCase): strategy.geo_sgd_mode = True strategy.geo_sgd_need_push_nums = 5 - avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse) + avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse) fluid.clip.set_gradient_clip( clip=fluid.clip.GradientClipByGlobalNorm(2.0)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py index c3ffd50dc8da16f4a19c8da5383fe7f763aa7a72..02a739c060cd2bd58ecec4d7dc65b65e8a3a35a7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py @@ -36,13 +36,45 @@ class TestDistHeterDatasetAsync2x2(TestFleetHeterBase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", - "CPU_NUM": "1" + "CPU_NUM": "3" } required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "4" + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True) + + +class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase): + def _setup_config(self): + self._mode = "async" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "3" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py new file mode 100644 index 
0000000000000000000000000000000000000000..ec34993905e3cfc4603ac48987a690b7fa8a5439 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest +import tempfile +from test_dist_fleet_base import TestFleetBase + + +class TestDistSimnetASync2x2(TestFleetBase): + def _setup_config(self): + self._mode = "async" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "2" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_simnet_bow.py", delta=1e-5, check_error_log=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py deleted file mode 100644 index 3189f092413c1f6f1526a5ca66b27f91c95082b1..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import print_function - -import os -import unittest - -from test_dist_base import TestDistBase - -import os -flag_name = os.path.splitext(__file__)[0] - - -class TestDistSimnetBowDense2x2(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2DenseAsync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._enforce_place = "CPU" - - # FIXME(typhoonzero): fix async tests later - def notest_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1', - } - self.check_with_place( - "dist_simnet_bow.py", - delta=100, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBowSparse2x2(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2SparseAsync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=100, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -# FIXME(tangwei): Learningrate variable is not created on pserver. 
-class TestDistSimnetBow2x2LookupTableSync(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=100, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '0' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index 533ad9604cf0d879371796fb197e61e931fb479f..47a1c407230527d53327ba57d7b5d7a979bd7d49 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -40,8 +40,11 @@ class DistributionNumpy(): class UniformNumpy(DistributionNumpy): def __init__(self, low, high): - self.low = np.array(low).astype('float32') - self.high = np.array(high).astype('float32') + self.low = np.array(low) + self.high = np.array(high) + if str(self.low.dtype) not in ['float32', 'float64']: + self.low = self.low.astype('float32') + self.high = self.high.astype('float32') def sample(self, shape): shape = tuple(shape) + (self.low + self.high).shape @@ -49,13 +52,13 @@ class UniformNumpy(DistributionNumpy): (self.high - self.low)) def log_prob(self, value): - lb = np.less(self.low, value).astype('float32') - ub = np.less(value, self.high).astype('float32') + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) return np.log(lb * ub) - np.log(self.high - self.low) def probs(self, value): - lb = np.less(self.low, value).astype('float32') - ub = np.less(value, self.high).astype('float32') + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) return (lb * ub) / (self.high - self.low) def entropy(self): @@ -64,8 +67,11 @@ class UniformNumpy(DistributionNumpy): class NormalNumpy(DistributionNumpy): def __init__(self, loc, scale): - self.loc = np.array(loc).astype('float32') - self.scale = np.array(scale).astype('float32') + self.loc = np.array(loc) + self.scale = np.array(scale) + if str(self.loc.dtype) not in ['float32', 'float64']: + self.loc = self.loc.astype('float32') + self.scale = self.scale.astype('float32') def sample(self, shape): shape = tuple(shape) + (self.loc + self.scale).shape @@ -83,8 +89,8 @@ class NormalNumpy(DistributionNumpy): (2. * var)) / (math.sqrt(2 * math.pi) * self.scale) def entropy(self): - return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype( - 'float32')) + np.log(self.scale) + return 0.5 + 0.5 * np.log( + np.array(2. 
* math.pi).astype(self.loc.dtype)) + np.log(self.scale) def kl_divergence(self, other): var_ratio = (self.scale / other.scale) @@ -94,724 +100,571 @@ class NormalNumpy(DistributionNumpy): return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) -class DistributionTest(unittest.TestCase): - def setUp(self, use_gpu=False): +class UniformTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=5, dims=6): self.use_gpu = use_gpu if not use_gpu: - place = fluid.CPUPlace() + self.place = fluid.CPUPlace() self.gpu_id = -1 else: - place = fluid.CUDAPlace(0) + self.place = fluid.CUDAPlace(0) self.gpu_id = 0 - self.executor = fluid.Executor(place) - - def build_normal_common_net(self, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, - scale_np, other_scale_np, loc_np, other_loc_np, - loc, scale, other_loc, other_scale, values): - """Generate Normal object and get the output of its methods including - ``sample``, ``entropy``, ``log_prob``, ``probs`` and ``kl_divergence``. - Parameters ``loc`` and ``scale`` have different data types to test different situations. - - Args: - batch_size(int): The first dimension of the shape of parameters(loc and scale). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - loc_float(float): Generated in function ``get_normal_random_input``, loc is a float number. - scale_float(float): Generated in function ``get_normal_random_input``, scale is a float number. - other_loc_float(float): Generated in function ``get_normal_random_input``, other_loc is a - float number. It is the first parameter in another Normal object used in ``kl_divergence`` - method. - other_scale_float(float): Generated in function ``get_normal_random_input``, other_scale is a - float number. It is the second parameter in another Normal object used in ``kl_divergence`` - method. - scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array - whose shape is [batch_size, dims]. - other_scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_scale_np - is an numpy array. It is the second parameter in another Normal object used in ``kl_divergence`` - method. - loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array - whose shape is [batch_size, dims]. - other_loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_loc_np - is an numpy array. It is the first parameter in another Normal object used in ``kl_divergence`` - method. - loc(Tensor): In dynamic mode, loc is generated in ``build_normal_dygraph``, it's a Tensor filled - with ``loc_np`` data. In static mode, loc is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - scale(Tensor): In dynamic mode, scale is generated in ``build_normal_dygraph``, it's a Tensor filled - with ``scale_np`` data. In static mode, scale is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - other_loc(Tensor): In dynamic mode, other_loc is generated in ``build_normal_dygraph``, it's a Tensor - filled with ``other_loc_np`` data. In static mode, other_loc is generated in ``build_normal_static``, - ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the first parameter - in another Normal object used in ``kl_divergence`` method. 
- other_scale(Tensor): In dynamic mode, other_scale is generated in ``build_normal_dygraph``, it's a Tensor - filled with ``other_scale_np`` data. In static mode, other_scale is generated in ``build_normal_static``, - ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the second parameter - in another Normal object used in ``kl_divergence`` method. - values(Tensor): In dynamic mode, values is generated in ``build_normal_dygraph``, it's a Tensor filled with - ``values_np`` data. In static mode, values is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - - Returns: - List: The elements of the list are the output of sample, entropy, log_prob, probs, kl_divergence methods. - The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be considered. - - """ - normal_int = Normal(int(loc_float), int(scale_float)) - normal_float = Normal(loc_float, scale_float) - other_normal_float = Normal(other_loc_float, other_scale_float) - - normal_float_np_broadcast = Normal(loc_float, scale_np) - other_normal_float_np_broadcast = Normal(other_loc_float, - other_scale_np) - - normal_np = Normal(loc_np, scale_np) - other_normal_np = Normal(other_loc_np, other_scale_np) - - normal_variable = Normal(loc, scale) - other_normal_variable = Normal(other_loc, other_scale) - - sample_int = normal_int.sample([batch_size, dims]) - sample_float = normal_float.sample([batch_size, dims]) - sample_float_np_broadcast = normal_float_np_broadcast.sample( - [batch_size, dims]) - sample_np = normal_np.sample([batch_size, dims]) - sample_variable = normal_variable.sample([batch_size, dims]) - - sample_int_diff = normal_int.sample([sample_shape]) - sample_float_diff = normal_float.sample([sample_shape]) - sample_float_np_broadcast_diff = normal_float_np_broadcast.sample( - [sample_shape]) - sample_np_diff = normal_np.sample([sample_shape]) - sample_variable_diff = normal_variable.sample([sample_shape]) - - entropy_int = normal_int.entropy() - entropy_float = normal_float.entropy() - entropy_float_np_broadcast = normal_float_np_broadcast.entropy() - entropy_np = normal_np.entropy() - entropy_variable = normal_variable.entropy() - - lp_float_np_broadcast = normal_float_np_broadcast.log_prob(values) - lp_np = normal_np.log_prob(values) - lp_variable = normal_variable.log_prob(values) - - p_float_np_broadcast = normal_float_np_broadcast.probs(values) - p_np = normal_np.probs(values) - p_variable = normal_variable.probs(values) - - kl_float = normal_float.kl_divergence(other_normal_float) - kl_float_np_broadcast = normal_float_np_broadcast.kl_divergence( - other_normal_float_np_broadcast) - kl_np = normal_np.kl_divergence(other_normal_np) - kl_variable = normal_variable.kl_divergence(other_normal_variable) - - fetch_list = [ - sample_int, sample_float, sample_float_np_broadcast, sample_np, - sample_variable, sample_int_diff, sample_float_diff, - sample_float_np_broadcast_diff, sample_np_diff, - sample_variable_diff, entropy_int, entropy_float, - entropy_float_np_broadcast, entropy_np, entropy_variable, - lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, - p_np, p_variable, kl_float, kl_float_np_broadcast, kl_np, - kl_variable - ] - return fetch_list - - def build_normal_static(self, test_program, batch_size, dims, sample_shape, - loc_float, scale_float, other_loc_float, - other_scale_float, scale_np, other_scale_np, loc_np, - other_loc_np, values_np): - """ - In static mode, generate feed data 
of Normal network, and get output fetch_list using - ``build_normal_common_net``. - - Args: - test_program: In static mode, the Program object. - other args can refer to function ``build_normal_common_net``. - - Returns: - feed_vars: The feed data of Normal network in static mode. - fetch_list: The output is generated by function ``build_normal_common_net``. - """ - with fluid.program_guard(test_program): - loc = layers.data(name='loc', shape=[dims], dtype='float32') - scale = layers.data(name='scale', shape=[dims], dtype='float32') - - other_loc = layers.data( - name='other_loc', shape=[dims], dtype='float32') - other_scale = layers.data( - name='other_scale', shape=[dims], dtype='float32') - values = layers.data(name='values', shape=[dims], dtype='float32') + self.init_numpy_data(batch_size, dims) - fetch_list = self.build_normal_common_net( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, loc, scale, other_loc, other_scale, - values) + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) - feed_vars = { - 'loc': loc_np, - 'scale': scale_np, - 'other_loc': other_loc_np, - 'other_scale': other_scale_np, - 'values': values_np - } - return feed_vars, fetch_list - - def build_normal_dygraph(self, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, - scale_np, other_scale_np, loc_np, other_loc_np, - values_np): - """ - In dynamic mode, generate input data of Normal network, and get output fetch_list using - ``build_normal_common_net``. - - Args: - refer to function ``build_normal_common_net``. - - Returns: - fetch_list_numpy: The output is generated by function ``build_normal_common_net``. Transform - these tensor to numpy.ndarray. - """ - loc = paddle.to_tensor(loc_np) - scale = paddle.to_tensor(scale_np) - other_loc = paddle.to_tensor(other_loc_np) - other_scale = paddle.to_tensor(other_scale_np) - values = paddle.to_tensor(values_np) - - fetch_list = self.build_normal_common_net( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, loc, scale, other_loc, other_scale, values) - fetch_list_numpy = [t.numpy() for t in fetch_list] - return fetch_list_numpy - - def get_normal_random_input(self, batch_size, dims): - """ - Generate input data ``loc`` and ``scale`` used in Normal network. - - Args: - refer to function ``build_normal_common_net``. - - Returns: - List: Different data type of ``loc`` and ``scale``, including float, numpy.ndarray. - By the way, ``other_loc`` and ``other_scale`` are used in ``kl_divergence`` method. - refer to ``args`` in function ``build_normal_common_net``. 
- """ - loc_np = np.random.randn(batch_size, dims).astype('float32') - other_loc_np = np.random.randn(batch_size, dims).astype('float32') - - loc_float = (np.random.ranf() - 0.5) * 4 - scale_float = (np.random.ranf() - 0.5) * 4 - while scale_float < 0: - scale_float = (np.random.ranf() - 0.5) * 4 - - other_loc_float = (np.random.ranf() - 0.5) * 4 - other_scale_float = (np.random.ranf() - 0.5) * 4 - while other_scale_float < 0: - other_scale_float = (np.random.ranf() - 0.5) * 4 - - scale_np = np.random.randn(batch_size, dims).astype('float32') - other_scale_np = np.random.randn(batch_size, dims).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - while not np.all(scale_np > 0): - scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(other_scale_np > 0): - other_scale_np = np.random.randn(batch_size, dims).astype('float32') - return [ - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, - other_scale_float, scale_np, other_scale_np, values_np - ] - - def compare_normal_with_numpy(self, - data_list, - output_list, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Compare the outputs of Normal's methods in paddle and numpy. If the outputs are not consistent, - raise errors. - - Args: - data_list: Input data generated by function ``get_normal_random_input``. - output_list: The outputs of Normal's methods in static or dynamic mode. - batch_size(int): The first dimension of the shape of parameters(loc and scale). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - tolerance(float): The tolerance of the error. - """ - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - np_normal_int = NormalNumpy(int(loc_float), int(scale_float)) - np_normal_float = NormalNumpy(loc_float, scale_float) - np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float) - np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np) - np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float, - other_scale_np) - np_normal = NormalNumpy(loc_np, scale_np) - np_other_normal = NormalNumpy(other_loc_np, other_scale_np) - - gt_sample_int = np_normal_int.sample([batch_size, dims]) - gt_sample_float = np_normal_float.sample([batch_size, dims]) - gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample( - [batch_size, dims]) - gt_sample_np = np_normal.sample([batch_size, dims]) - - gt_sample_int_diff = np_normal_int.sample([sample_shape]) - gt_sample_float_diff = np_normal_float.sample([sample_shape]) - gt_sample_float_np_broadcast_diff = np_normal_float_np_broadcast.sample( - [sample_shape]) - gt_sample_np_diff = np_normal.sample([sample_shape]) - - gt_entropy_int = np_normal_int.entropy() - gt_entropy_float = np_normal_float.entropy() - gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy() - gt_entropy = np_normal.entropy() - gt_lp_float_np_broadcast = np_normal_float_np_broadcast.log_prob( - values_np) - gt_lp = np_normal.log_prob(values_np) - gt_p_float_np_broadcast = np_normal_float_np_broadcast.probs(values_np) - gt_p = np_normal.probs(values_np) - gt_kl_float = np_normal_float.kl_divergence(np_other_normal_float) - gt_kl_float_np_broadcast = np_normal_float_np_broadcast.kl_divergence( - np_other_normal_float_np_broadcast) - gt_kl = np_normal.kl_divergence(np_other_normal) - - [ - output_sample_int, output_sample_float, - 
output_sample_float_np_broadcast, output_sample_np, - output_sample_variable, output_sample_int_diff, - output_sample_float_diff, output_sample_float_np_broadcast_diff, - output_sample_np_diff, output_sample_variable_diff, - output_entropy_int, output_entropy_float, - output_entropy_float_np_broadcast, output_entropy_np, - output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, - output_lp_variable, output_p_float_np_broadcast, output_p_np, - output_p_variable, output_kl_float, output_kl_float_np_broadcast, - output_kl_np, output_kl_variable - ] = output_list - - np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) - np.testing.assert_equal(output_sample_float.shape, - gt_sample_float.shape) - np.testing.assert_equal(output_sample_float_np_broadcast.shape, - gt_sample_float_np_broadcast.shape) - np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) - np.testing.assert_equal(output_sample_variable.shape, - gt_sample_np.shape) - np.testing.assert_equal(output_sample_int_diff.shape, - gt_sample_int_diff.shape) - np.testing.assert_equal(output_sample_float_diff.shape, - gt_sample_float_diff.shape) - np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, - gt_sample_float_np_broadcast_diff.shape) - np.testing.assert_equal(output_sample_np_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_equal(output_sample_variable_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_allclose( - output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_float, - gt_entropy_float, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_float_np_broadcast, - gt_entropy_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_float_np_broadcast, - gt_lp_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_float_np_broadcast, - gt_p_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_p_np, gt_p, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_variable, gt_p, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_kl_float_np_broadcast, - gt_kl_float_np_broadcast, - rtol=tolerance, - atol=tolerance) + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # low ans high are 'float' + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(1, 3) + self.values_np = np.array([1.0]).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', 
shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs = fetch_list + + np_uniform = UniformNumpy(self.low_np, self.high_np) + np_sample = np_uniform.sample([sample_shape]) + np_entropy = np_uniform.entropy() + np_lp = np_uniform.log_prob(self.values_np) + np_p = np_uniform.probs(self.values_np) + + np.testing.assert_equal(sample.shape, np_sample.shape) np.testing.assert_allclose( - output_kl_np, gt_kl, rtol=tolerance, atol=tolerance) + entropy, np_entropy, rtol=tolerance, atol=tolerance) np.testing.assert_allclose( - output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance) - - def test_normal_distribution_static(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Normal's methods in static mode. - - Args: - refer to ``compare_normal_with_numpy`` function. - """ - test_program = fluid.Program() - data_list = self.get_normal_random_input(batch_size, dims) - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - feed_vars, fetch_list = self.build_normal_static( - test_program, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, scale_np, - other_scale_np, loc_np, other_loc_np, values_np) - self.executor.run(fluid.default_startup_program()) + log_prob, np_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) - output_list = self.executor.run(program=test_program, - feed=feed_vars, - fetch_list=fetch_list) - - self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, - sample_shape, tolerance) - - def test_normal_distribution_dygraph(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Normal's methods in dynamic mode. - - Args: - refer to ``compare_normal_with_numpy`` function. 
- """ - paddle.disable_static() - data_list = self.get_normal_random_input(batch_size, dims) - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - output_list = self.build_normal_dygraph( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, values_np) - - self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, - sample_shape, tolerance) + def test_uniform_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + uniform = Uniform(self.dynamic_low, self.dynamic_high) + sample = uniform.sample([sample_shape]).numpy() + entropy = uniform.entropy().numpy() + log_prob = uniform.log_prob(self.dynamic_values).numpy() + probs = uniform.probs(self.dynamic_values).numpy() + fetch_list = [sample, entropy, log_prob, probs] + + self.compare_with_numpy(fetch_list) + + def test_uniform_distribution_static(self, sample_shape=7, tolerance=1e-6): paddle.enable_static() + with fluid.program_guard(self.test_program): + uniform = Uniform(self.static_low, self.static_high) + sample = uniform.sample([sample_shape]) + entropy = uniform.entropy() + log_prob = uniform.log_prob(self.static_values) + probs = uniform.probs(self.static_values) + fetch_list = [sample, entropy, log_prob, probs] - def build_uniform_common_net(self, batch_size, dims, sample_shape, - low_float, high_float, high_np, low_np, - values_np, low, high, values): - """Generate Uniform object and get the output of its methods including ``sample``, ``entropy``, - ``log_prob`` and ``probs``. - Parameters ``low`` and ``high`` have different data types to test different situations. - - Args: - batch_size(int): The first dimension of the shape of parameters(low and high). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - low_float(float): Parameter ``low`` is a float number. - high_float(float): Parameter ``high`` is a float number. - high_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. - low_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. - values_np(numpy.ndarray): The input of ``log_prob`` and ``probs`` methods. An numpy array whose - shape is [batch_size, dims]. - low(Tensor): In dynamic mode, low is generated in ``build_uniform_dygraph``, it's a Tensor filled - with ``low_np`` data. In static mode, low is generated in ``build_uniform_static``. - high(Tensor): In dynamic mode, high is generated in ``build_uniform_dygraph``, it's a Tensor filled - with ``high_np`` data. In static mode, high is generated in ``build_uniform_static``. - values(Tensor): In dynamic mode, values is generated in ``build_uniform_dygraph``, it's a Tensor - filled with ``values_np`` data. In static mode, values is generated in ``build_uniform_static``. - - Returns: - List: The elements of the list are the output of sample, entropy, log_prob, probs methods. - The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be - considered. 
- - """ - uniform_int = Uniform(int(low_float), int(high_float)) - uniform_float = Uniform(low_float, high_float) - uniform_float_np_broadcast = Uniform(low_float, high_np) - uniform_np = Uniform(low_np, high_np) - uniform_variable = Uniform(low, high) - - sample_int = uniform_int.sample([batch_size, dims]) - sample_float = uniform_float.sample([batch_size, dims]) - sample_float_np_broadcast = uniform_float_np_broadcast.sample( - [batch_size, dims]) - sample_np = uniform_np.sample([batch_size, dims]) - sample_variable = uniform_variable.sample([batch_size, dims]) - - sample_int_diff = uniform_int.sample([sample_shape]) - sample_float_diff = uniform_float.sample([sample_shape]) - sample_float_np_broadcast_diff = uniform_float_np_broadcast.sample( - [sample_shape]) - sample_np_diff = uniform_np.sample([sample_shape]) - sample_variable_diff = uniform_variable.sample([sample_shape]) - - entropy_int = uniform_int.entropy() - entropy_float = uniform_float.entropy() - entropy_float_np_broadcast = uniform_float_np_broadcast.entropy() - entropy_np = uniform_np.entropy() - entropy_variable = uniform_variable.entropy() - - lp_float_np_broadcast = uniform_float_np_broadcast.log_prob(values) - lp_np = uniform_np.log_prob(values) - lp_variable = uniform_variable.log_prob(values) - - p_float_np_broadcast = uniform_float_np_broadcast.probs(values) - p_np = uniform_np.probs(values) - p_variable = uniform_variable.probs(values) - - fetch_list = [ - sample_int, sample_float, sample_float_np_broadcast, sample_np, - sample_variable, sample_int_diff, sample_float_diff, - sample_float_np_broadcast_diff, sample_np_diff, - sample_variable_diff, entropy_int, entropy_float, - entropy_float_np_broadcast, entropy_np, entropy_variable, - lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, - p_np, p_variable - ] - return fetch_list - - def build_uniform_static(self, test_program, batch_size, dims, sample_shape, - low_float, high_float, high_np, low_np, values_np): - """ - In static mode, generate feed data of Uniform network, and get output fetch_list using - ``build_uniform_common_net``. - - Args: - test_program: In static mode, the Program object. - other args can refer to function ``build_uniform_common_net``. - - Returns: - feed_vars: The feed data of Uniform network in static mode. - fetch_list: The output is generated by function ``build_uniform_common_net``. - """ - with fluid.program_guard(test_program): - low = layers.data(name='low', shape=[dims], dtype='float32') - high = layers.data(name='high', shape=[dims], dtype='float32') - - values = layers.data(name='values', shape=[dims], dtype='float32') - - fetch_list = self.build_uniform_common_net( - batch_size, dims, sample_shape, low_float, high_float, high_np, - low_np, values_np, low, high, values) - - feed_vars = {'low': low_np, 'high': high_np, 'values': values_np} - return feed_vars, fetch_list - - def build_uniform_dygraph(self, batch_size, dims, sample_shape, low_float, - high_float, high_np, low_np, values_np): - """ - In dynamic mode, generate input data of Uniform network, and get output fetch_list using - ``build_uniform_common_net``. - - Args: - refer to function ``build_uniform_common_net``. - - Returns: - fetch_list_numpy: The output is generated by function ``build_uniform_common_net``. Transform - these tensor to numpy.ndarray. 
- """ - low = paddle.to_tensor(low_np) - high = paddle.to_tensor(high_np) - values = paddle.to_tensor(values_np) - - fetch_list = self.build_uniform_common_net( - batch_size, dims, sample_shape, low_float, high_float, high_np, - low_np, values_np, low, high, values) - fetch_list_numpy = [t.numpy() for t in fetch_list] - return fetch_list_numpy - - def compare_uniform_with_numpy(self, - data_list, - output_list, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Compare the outputs of Uniform's methods in paddle and numpy. If the outputs are not consistent, - raise errors. - - Args: - data_list: Input data including float and numpy.ndarray type of ``low`` and ``high`` parameters. - output_list: The outputs of Uniform's methods in static or dynamic mode. - batch_size(int): The first dimension of the shape of parameters(low and high). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - tolerance(float): The tolerance of the error. - """ - [low_np, low_float, high_float, high_np, values_np] = data_list - - np_uniform_int = UniformNumpy(int(low_float), int(high_float)) - np_uniform_float = UniformNumpy(low_float, high_float) - np_uniform_float_np_broadcast = UniformNumpy(low_float, high_np) - np_uniform = UniformNumpy(low_np, high_np) - - gt_sample_int = np_uniform_int.sample([batch_size, dims]) - gt_sample_float = np_uniform_float.sample([batch_size, dims]) - gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample( - [batch_size, dims]) - gt_sample_np = np_uniform.sample([batch_size, dims]) - gt_sample_int_diff = np_uniform_int.sample([sample_shape]) - gt_sample_float_diff = np_uniform_float.sample([sample_shape]) - gt_sample_float_np_broadcast_diff = np_uniform_float_np_broadcast.sample( - [sample_shape]) - gt_sample_np_diff = np_uniform.sample([sample_shape]) - gt_entropy_int = np_uniform_int.entropy() - gt_entropy_float = np_uniform_float.entropy() - gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy() - gt_entropy = np_uniform.entropy() - gt_lp_float_np_broadcast = np_uniform_float_np_broadcast.log_prob( - values_np) - gt_lp = np_uniform.log_prob(values_np) - gt_p_float_np_broadcast = np_uniform_float_np_broadcast.probs(values_np) - gt_p = np_uniform.probs(values_np) - - [ - output_sample_int, output_sample_float, - output_sample_float_np_broadcast, output_sample_np, - output_sample_variable, output_sample_int_diff, - output_sample_float_diff, output_sample_float_np_broadcast_diff, - output_sample_np_diff, output_sample_variable_diff, - output_entropy_int, output_entropy_float, - output_entropy_float_np_broadcast, output_entropy_np, - output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, - output_lp_variable, output_p_float_np_broadcast, output_p_np, - output_p_variable - ] = output_list - - np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) - np.testing.assert_equal(output_sample_float.shape, - gt_sample_float.shape) - np.testing.assert_equal(output_sample_float_np_broadcast.shape, - gt_sample_float_np_broadcast.shape) - np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) - np.testing.assert_equal(output_sample_variable.shape, - gt_sample_np.shape) - np.testing.assert_equal(output_sample_int_diff.shape, - gt_sample_int_diff.shape) - np.testing.assert_equal(output_sample_float_diff.shape, - gt_sample_float_diff.shape) - np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, - 
gt_sample_float_np_broadcast_diff.shape) - np.testing.assert_equal(output_sample_np_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_equal(output_sample_variable_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_allclose( - output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_float, - gt_entropy_float, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_float_np_broadcast, - gt_entropy_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_float_np_broadcast, - gt_lp_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_float_np_broadcast, - gt_p_float_np_broadcast, - rtol=tolerance, - atol=tolerance) + feed_vars = { + 'low': self.low_np, + 'high': self.high_np, + 'values': self.values_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class UniformTest2(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low ans high are 'int' + self.low_np = int(np.random.uniform(-2, 1)) + self.high_np = int(np.random.uniform(1, 3)) + self.values_np = np.array([1.0]).astype('float32') + + +class UniformTest3(UniformTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: low is float, high is numpy.ndarray with dtype 'float32'. + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest4(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float32'. + self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest5(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float64'. 
+ self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest6(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP32'. + self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np) + self.dynamic_high = paddle.to_tensor(self.high_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float32') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest7(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. + self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest8(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. 
+ self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float32') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=2, dims=3): + self.use_gpu = use_gpu + if not use_gpu: + self.place = fluid.CPUPlace() + self.gpu_id = -1 + else: + self.place = fluid.CUDAPlace(0) + self.gpu_id = 0 + + self.init_numpy_data(batch_size, dims) + + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) + + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # loc ans scale are 'float' + self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = (np.random.ranf() - 0.5) * 4 + while self.scale_np < 0: + self.scale_np = (np.random.ranf() - 0.5) * 4 + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + while self.other_scale_np < 0: + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + self.values_np = np.random.ranf(1).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np + self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs, kl = fetch_list + + np_normal = NormalNumpy(self.loc_np, self.scale_np) + np_sample = np_normal.sample([sample_shape]) + np_entropy = np_normal.entropy() + np_lp = np_normal.log_prob(self.values_np) + np_p = np_normal.probs(self.values_np) + np_other_normal = NormalNumpy(self.other_loc_np, self.other_scale_np) + np_kl = np_normal.kl_divergence(np_other_normal) + + np.testing.assert_equal(sample.shape, np_sample.shape) np.testing.assert_allclose( - output_p_np, gt_p, rtol=tolerance, atol=tolerance) + entropy, np_entropy, rtol=tolerance, atol=tolerance) np.testing.assert_allclose( - output_p_variable, gt_p, rtol=tolerance, atol=tolerance) - - def test_uniform_distribution_static(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Uniform's methods in static mode. - - Args: - refer to ``compare_uniform_with_numpy`` function. 
- """ - test_program = fluid.Program() - - low_np = np.random.randn(batch_size, dims).astype('float32') - low_float = np.random.uniform(-2, 1) - high_float = np.random.uniform(1, 3) - high_np = np.random.uniform(-5.0, 5.0, - (batch_size, dims)).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - data_list = [low_np, low_float, high_float, high_np, values_np] - - feed_vars, fetch_list = self.build_uniform_static( - test_program, batch_size, dims, sample_shape, low_float, high_float, - high_np, low_np, values_np) + log_prob, np_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(kl, np_kl, rtol=tolerance, atol=tolerance) - self.executor.run(fluid.default_startup_program()) + def test_normal_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + normal = Normal(self.dynamic_loc, self.dynamic_scale) + + sample = normal.sample([sample_shape]).numpy() + entropy = normal.entropy().numpy() + log_prob = normal.log_prob(self.dynamic_values).numpy() + probs = normal.probs(self.dynamic_values).numpy() + other_normal = Normal(self.dynamic_other_loc, self.dynamic_other_scale) + kl = normal.kl_divergence(other_normal).numpy() - # result calculated by paddle - output_list = self.executor.run(program=test_program, - feed=feed_vars, - fetch_list=fetch_list) - self.compare_uniform_with_numpy(data_list, output_list, batch_size, - dims, sample_shape, tolerance) - - def test_uniform_distribution_dygraph(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Uniform's methods in dynamic mode. - - Args: - refer to ``compare_uniform_with_numpy`` function. - """ - paddle.disable_static() - - low_np = np.random.randn(batch_size, dims).astype('float32') - low_float = np.random.uniform(-2, 1) - high_float = np.random.uniform(1, 3) - high_np = np.random.uniform(-5.0, 5.0, - (batch_size, dims)).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - data_list = [low_np, low_float, high_float, high_np, values_np] - output_list = self.build_uniform_dygraph(batch_size, dims, sample_shape, - low_float, high_float, high_np, - low_np, values_np) - - self.compare_uniform_with_numpy(data_list, output_list, batch_size, - dims, sample_shape, tolerance) + fetch_list = [sample, entropy, log_prob, probs, kl] + self.compare_with_numpy(fetch_list) + + def test_normal_distribution_static(self, sample_shape=7, tolerance=1e-6): paddle.enable_static() + with fluid.program_guard(self.test_program): + normal = Normal(self.static_loc, self.static_scale) + + sample = normal.sample([sample_shape]) + entropy = normal.entropy() + log_prob = normal.log_prob(self.static_values) + probs = normal.probs(self.static_values) + other_normal = Normal(self.static_other_loc, + self.static_other_scale) + kl = normal.kl_divergence(other_normal) + + fetch_list = [sample, entropy, log_prob, probs, kl] + + feed_vars = { + 'loc': self.loc_np, + 'scale': self.scale_np, + 'values': self.values_np, + 'other_loc': self.other_loc_np, + 'other_scale': self.other_scale_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class NormalTest2(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc ans scale are 'int' + self.loc_np = int((np.random.ranf() - 0.5) * 8) + self.scale_np = 
int((np.random.ranf() - 0.5) * 8) + while self.scale_np < 0: + self.scale_np = int((np.random.ranf() - 0.5) * 8) + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = int((np.random.ranf() - 0.5) * 8) + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + while self.other_scale_np < 0: + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + self.values_np = np.random.ranf(1).astype('float32') + + +class NormalTest3(NormalTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: loc is float, scale is numpy.ndarray with dtype 'float32'. + self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest4(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float32'. + self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest5(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float64'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np + self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class NormalTest6(NormalTest): + def init_data(self, batch_size=2, dims=3): + # loc and scale are Tensor with dtype 'VarType.FP32'. + self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + self.loc = paddle.to_tensor(self.loc_np) + self.scale = paddle.to_tensor(self.scale_np) + self.values = paddle.to_tensor(self.values_np) + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + self.other_loc = paddle.to_tensor(self.other_loc_np) + self.other_scale = paddle.to_tensor(self.other_scale_np) + + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP32'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np) + self.dynamic_scale = paddle.to_tensor(self.scale_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor(self.other_loc_np) + self.dynamic_other_scale = paddle.to_tensor(self.other_scale_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float32') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float32') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float32') + + +class NormalTest7(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. + self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') + + +class NormalTest8(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') class DistributionTestError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index ceec1190279212fbe6f3f128bdd1397cdb9ea1a2..7b9e25e1d4ae8dbb8e4a03d93a7d9c0f9dd18ea6 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -40,6 +40,23 @@ class TestDropoutOp(OpTest): self.check_grad(['X'], 'Out') +class TestDropoutOpInput1d(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((2000, )).astype("float32")} + self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((2000)).astype('uint8') + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') + + class TestDropoutOp2(TestDropoutOp): def setUp(self): self.op_type = "dropout" @@ -436,6 +453,13 @@ class TestDropoutFAPIError(unittest.TestCase): self.assertRaises(ValueError, test_axis_max) + def test_axis_min(): + # minimum of axis should greater equal than 0 + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, axis=[0, -1]) + + self.assertRaises(ValueError, test_axis_min) + def test_axis_len(): # length of axis should not greater than dimensions of x x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") @@ -648,9 +672,11 @@ class TestAlphaDropoutFAPI(unittest.TestCase): res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.) res2 = paddle.nn.functional.alpha_dropout( x=input, p=0., training=False) + res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.) 
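# A minimal dygraph sketch of the behaviour the new `res3` assertions encode:
# alpha_dropout with p=1.0 is expected to drop every element and return an
# all-zero tensor of the input's shape (the array name and shape below are
# illustrative only; every API called here already appears in this test file).
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random([4, 4]).astype("float32")
out = paddle.nn.functional.alpha_dropout(x=paddle.to_tensor(x_np), p=1.0)
assert np.allclose(out.numpy(), np.zeros_like(x_np))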
in_np = np.random.random([40, 40]).astype("float32") res_np = in_np + res_np3 = np.zeros_like(in_np) exe = fluid.Executor(place) res_list = [res1, res2] @@ -659,6 +685,10 @@ class TestAlphaDropoutFAPI(unittest.TestCase): feed={"input": in_np}, fetch_list=[res]) self.assertTrue(np.allclose(fetches[0], res_np)) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res3]) + self.assertTrue(np.allclose(fetches[0], res_np3)) def test_static(self): for place in self.places: @@ -669,15 +699,18 @@ class TestAlphaDropoutFAPI(unittest.TestCase): with fluid.dygraph.guard(place): in_np = np.random.random([40, 40]).astype("float32") res_np = in_np + res_np3 = np.zeros_like(in_np) input = fluid.dygraph.to_variable(in_np) res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.) res2 = paddle.nn.functional.alpha_dropout( x=input, p=0., training=False) + res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.) res_list = [res1, res2] for res in res_list: self.assertTrue(np.allclose(res.numpy(), res_np)) + self.assertTrue(np.allclose(res3.numpy(), res_np3)) class TestAlphaDropoutFAPIError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py index f5d8b4f704da8acd97475444346522f63d3724fd..cab6160d761004877896deea8d44ca02c9de2e1e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -220,6 +220,14 @@ class TestRemainderAPI(unittest.TestCase): z_expected = np.array([0, 1, 1, -1]) self.assertEqual(np.allclose(z_expected, z.numpy()), True) + np_x = np.array([-3, 3]) + np_y = np.array([[2, 3], [-2, -1]]) + x = paddle.to_tensor(np_x, dtype="int64") + y = paddle.to_tensor(np_y, dtype="int64") + z = x % y + z_expected = np.array([[1, 0], [-1, 0]]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 3475320eeebc55a14dd569410610b70ae35e65a3..43069470680c7d49071ce54bf3649962c56f06ea 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -350,6 +350,14 @@ class TestFillConstantOpError(unittest.TestCase): dtype='int16', out=x1) + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1.1], + value=5, + dtype='float32', + out=x1) + # The argument dtype of fill_constant_op must be one of bool, float16, #float32, float64, int32 or int64 x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32") diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index 9e651dea24ba7f35f3785093da8ac73dde07be5a..4ced9841ee43e02a3d1e3f292bf97200dec29f5c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -18,6 +18,7 @@ import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import os import paddle.fluid as fluid +import numpy as np class TestFleetBase(unittest.TestCase): @@ -125,5 +126,110 @@ class TestFleetBase(unittest.TestCase): self.assertRaises(Exception, fleet.init_worker) +class TestFleetDygraph(unittest.TestCase): + def setUp(self): + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213,127.0.0.1:36214" 
+ os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def test_dygraph_method(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=layer.parameters()) + # remove init cause this UT cannot launch distributed task + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + lr = 0.001 + adam.set_lr(lr) + cur_lr = adam.get_lr() + assert (lr == cur_lr) + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + + +class TestFleetBaseSingleRunCollective(unittest.TestCase): + def setUp(self): + os.environ.pop("PADDLE_TRAINER_ENDPOINTS") + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_run_collective_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init(is_collective=True) + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + + for i in range(10): + cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + +class TestFleetBaseSingleRunPS(unittest.TestCase): + def setUp(self): + os.environ.pop("PADDLE_PSERVERS_IP_PORT_LIST") + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_run_ps_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init() + strategy = paddle.distributed.fleet.DistributedStrategy() + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + if fleet.is_server(): + fleet.init_server() + fleet.run_server() + elif fleet.is_worker(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + step = 100 + for i in range(step): + cost_val = exe.run(program=fluid.default_main_program(), + feed=self.gen_data(), + fetch_list=[avg_cost.name]) + print("worker_index: %d, step%d cost = %f" % + (fleet.worker_index(), i, cost_val[0])) + fleet.save_persistables(exe, "fleet_single_model/") + print("save fleet models done.") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py 
b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 9eec73116cc283b58d3ee39cefb9256e12d4ef15..927c155ff1116a821a13730a9d2a779a7c68b254 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -190,7 +190,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): avg_cost = paddle.fluid.layers.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index fc668ce3493e96e0790af522a439367fe10455f3..dddc6811ef08bdf8504cb6b4fe09813336875b10 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -239,24 +239,24 @@ class TestGaussianRandomAPI(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') - paddle.tensor.random.gaussian_random([2, 3]) + paddle.tensor.random.gaussian([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') - out = paddle.tensor.random.gaussian_random([2, 3]) + out = paddle.tensor.random.gaussian([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') - out = paddle.tensor.random.gaussian_random([2, 3]) + out = paddle.tensor.random.gaussian([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() @@ -265,24 +265,24 @@ class TestStandardNormalDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.standard_normal([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.standard_normal([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.standard_normal([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 654e8d6f129e1ffe0dce59113ca88a16d348f210..a46b9b0ca78bf37e1c421a08a6fa8c5353c6d45d 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -35,24 +35,33 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v1(x): with 
fluid.dygraph.guard(p): - gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + gn = fluid.dygraph.GroupNorm(channels=6, groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() def compute_v2(x): with fluid.dygraph.guard(p): - gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + gn = paddle.nn.GroupNorm(num_channels=6, num_groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() + def test_weight_bias_false(): + with fluid.dygraph.guard(p): + gn = paddle.nn.GroupNorm( + num_channels=6, + num_groups=2, + weight_attr=False, + bias_attr=False) + x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x) y2 = compute_v2(x) self.assertTrue(np.allclose(y1, y2)) + test_weight_bias_false() def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -60,7 +69,7 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v1(x_np): with program_guard(Program(), Program()): - gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + gn = fluid.dygraph.GroupNorm(channels=6, groups=2) x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = gn(x) exe.run(fluid.default_startup_program()) @@ -69,7 +78,7 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v2(x_np): with program_guard(Program(), Program()): - gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + gn = paddle.nn.GroupNorm(num_channels=6, num_groups=2) x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = gn(x) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 74cfeab601b04d9624a5f6e48fd06c6cbf3715f8..22f16287c33f96a43361b5fe4ed5d0fe3edbb1bc 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -652,7 +652,7 @@ class TestDygraphUtils(unittest.TestCase): a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) helper = LayerHelper(fluid.unique_name.generate("test"), act="relu") func = helper.append_activation - with fluid.dygraph.guard(): + with fluid.dygraph.guard(fluid.core.CPUPlace()): a = fluid.dygraph.to_variable(a_np) fluid.set_flags({'FLAGS_use_mkldnn': True}) try: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py index 820206a3ce630eb92a36a154ca7cdec62de2ce34..13ca1840d0d24c73577a547f186d4f03b13bca28 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py @@ -28,7 +28,7 @@ class TestTracerMode(unittest.TestCase): def get_tracer_mode(self): assert fluid.in_dygraph_mode(), "Dygraph mode must be enabled" - @paddle.no_grad() + @fluid.dygraph.no_grad def no_grad_func(self, a): self.assertEqual(self.tracer._train_mode, False) return a @@ -56,35 +56,17 @@ class TestTracerMode(unittest.TestCase): def need_no_grad_func(a, b=1): return a + b - decorated_func = paddle.no_grad()(need_no_grad_func) + decorated_func = fluid.dygraph.no_grad(need_no_grad_func) self.assertTrue( str(inspect.getargspec(decorated_func)) == str(inspect.getargspec(need_no_grad_func))) self.assertEqual(self.tracer._train_mode, self.init_mode) - def test_gen(): - for i in range(3): - yield i - - a 
= 0 - for i in test_gen(): - a += i - - @paddle.no_grad() - def test_wrapped_gen(): - for i in range(3): - yield i - - b = 0 - for i in test_wrapped_gen(): - b += i - - self.assertEqual(a, b) - with fluid.dygraph.guard(): self.check_not_support_rlt(False) + paddle.enable_static() with new_program_scope(): self.check_not_support_rlt(True) @@ -94,5 +76,48 @@ class TestTracerMode2(TestTracerMode): self.init_mode = False +class TestNoGradClass(unittest.TestCase): + @paddle.no_grad() + def no_grad_func(self, a): + self.assertEqual(self.tracer._train_mode, False) + return a + + def test_main(self): + paddle.disable_static() + + self.tracer = framework._dygraph_tracer() + self.tracer._train_mode = True + + self.assertEqual(self.no_grad_func(1), 1) + self.assertEqual(self.no_grad_func.__name__, "no_grad_func") + + def need_no_grad_func(a, b=1): + return a + b + + decorated_func = paddle.no_grad()(need_no_grad_func) + self.assertEqual( + str(inspect.getargspec(decorated_func)), + str(inspect.getargspec(need_no_grad_func))) + + def test_gen(): + for i in range(3): + yield i + + a = 0 + for i in test_gen(): + a += i + + @paddle.no_grad() + def test_wrapped_gen(): + for i in range(3): + yield i + + b = 0 + for i in test_wrapped_gen(): + b += i + + self.assertEqual(a, b) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 619e9e8e90783365b5f0d718783a14468520c8d4..887e50f07c55cc991d7816609253039ce0d48d7d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -401,9 +401,7 @@ class TestOptimizerLearningRate(unittest.TestCase): a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = fluid.dygraph.nn.Linear(10, 10) - a = fluid.dygraph.to_variable(a) - b = linear(a) loss = fluid.layers.reduce_mean(b) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 48aea3a584dd25667704b22d99d1074c481bb76c..22e19efcb58d19c41835565de2c8c01fe253702a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -374,8 +374,7 @@ class TestDygraphPtbRnn(unittest.TestCase): adam._learning_rate.step_num = 0 para_state_dict, opti_state_dict = paddle.load("./test_dy") - print(opti_state_dict['LR_Scheduler']) - adam.set_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): @@ -393,7 +392,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(para_state_dict) + ptb_model.set_state_dict(stat_dict=para_state_dict) state_dict = ptb_model.state_dict() @@ -483,7 +482,7 @@ class TestDygraphPtbRnn(unittest.TestCase): if isinstance(adam._learning_rate, LearningRateDecay): adam._learning_rate.step_num = 0 - adam.set_dict(self.opti_dict) + adam.set_state_dict(self.opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, core.VarBase): @@ -500,7 +499,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(self.state_dict) + ptb_model.set_state_dict(self.state_dict) state_dict = ptb_model.state_dict() @@ -593,7 +592,7 @@ class TestDygraphPtbRnn(unittest.TestCase): if isinstance(adam._learning_rate, 
LearningRateDecay): adam._learning_rate.step_num = 0 - adam.set_dict(np_opti_dict) + adam.set_state_dict(np_opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): @@ -613,7 +612,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(np_state_dict) + ptb_model.set_state_dict(np_state_dict) state_dict = ptb_model.state_dict() @@ -656,8 +655,8 @@ class TestDygraphPtbRnn(unittest.TestCase): last_hidden = None last_cell = None - adam.set_dict(self.opti_dict) - ptb_model.set_dict(self.state_dict) + adam.set_state_dict(self.opti_dict) + ptb_model.set_state_dict(self.state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -745,8 +744,8 @@ class TestDygraphPtbRnn(unittest.TestCase): last_cell = None state_dict, opti_dict = fluid.load_dygraph("./test_dy") - adam.set_dict(opti_dict) - ptb_model.set_dict(state_dict) + adam.set_state_dict(opti_dict) + ptb_model.set_state_dict(state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -849,8 +848,8 @@ class TestDygraphPtbRnn(unittest.TestCase): for k, v in self.state_dict.items(): np_state_dict[k] = v.numpy() - adam.set_dict(np_opti_dict) - ptb_model.set_dict(np_state_dict) + adam.set_state_dict(np_opti_dict) + ptb_model.set_state_dict(np_state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') y_data = np.arange(1, 13).reshape(4, 3).astype('int64') @@ -912,6 +911,22 @@ class TestDygraphPtbRnn(unittest.TestCase): para_state_dict, opti_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdopt')) + def test_load_compatible_with_keep_name_table(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy'), True) + self.assertTrue(para_state_dict != None) + self.assertTrue(opti_state_dict == None) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy'), keep_name_table=True) + self.assertTrue(para_state_dict != None) + self.assertTrue(opti_state_dict == None) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index e81d1c8610f6bebffadf930b67dc14a4a418ef05..3eb413a62664057c56567d5834b216110fac04fb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -297,6 +297,7 @@ class TestDygraphPtbRnn(unittest.TestCase): paddle.save(self.state_dict, "./test_dy_v2") def testLoadAndSetVarBase(self): + self.setUp() seed = 90 hidden_size = 10 vocab_size = 1000 @@ -917,6 +918,29 @@ class TestDygraphPtbRnn(unittest.TestCase): para_state_dict, opti_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdopt')) + def test_no_state_in_input_dict(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict.pop('weight') + + emb.set_state_dict(para_state_dict) + + def test_state_shape_mismatch(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, 
os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict['weight'] = np.expand_dims( + para_state_dict['weight'], axis=-1) + + emb.set_state_dict(para_state_dict) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index b02ba1a584b52dbbc99fcc8ed7bad438e7a9dd46..c45c144e3ad44c5781ea1f1d7d61028b56d8a254 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -48,7 +48,13 @@ class TestInstanceNorm(unittest.TestCase): instance_norm3d = paddle.nn.BatchNorm3d(1) instance_norm3d(fluid.dygraph.to_variable(x_data_4)) + def weight_bias_false(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + instance_norm3d = paddle.nn.BatchNorm3d( + 1, weight_attr=False, bias_attr=False) + with fluid.dygraph.guard(p): + weight_bias_false() self.assertRaises(ValueError, error1d) self.assertRaises(ValueError, error2d) self.assertRaises(ValueError, error3d) diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 87b6e76a6d0ab7f5fba7c4526734d81475e1540e..f7fcc1ff561b90dc1b78a67ffbe7c047ed06d0e9 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -183,25 +183,6 @@ class TestJitSaveLoad(unittest.TestCase): with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) - def test_load_dygraph_no_var_info(self): - model_path = "model.test_jit_save_load.no_var_info" - self.train_and_save_model(model_path=model_path) - # remove `__variables.info__` - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) - os.remove(var_info_path) - new_layer = LinearNet(784, 1) - with self.assertRaises(RuntimeError): - model_dict, _ = fluid.dygraph.load_dygraph(model_path) - - def test_load_dygraph_not_var_file(self): - model_path = "model.test_jit_save_load.no_var_file" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.params_filename = "__params__" - self.train_and_save_model(model_path=model_path, configs=configs) - new_layer = LinearNet(784, 1) - with self.assertRaises(RuntimeError): - model_dict, _ = fluid.dygraph.load_dygraph(model_path) - class LinearNetMultiInput(fluid.dygraph.Layer): def __init__(self, in_size, out_size): diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py index 6d1f42111eebff0f469317ddf2a9ec7698a7ae1e..03cb84ec99e0259a33a086c3d3e5a71abea09d2b 100644 --- a/python/paddle/fluid/tests/unittests/test_linspace.py +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -154,16 +154,16 @@ class TestLinspaceOpError(unittest.TestCase): self.assertRaises(TypeError, test_step_dtype) def test_start_dtype(): - start = fluid.data(shape=[1], dtype="int32", name="start") + start = fluid.data(shape=[1], dtype="float64", name="start") fluid.layers.linspace(start, 10, 1, dtype="float32") - self.assertRaises(TypeError, test_start_dtype) + self.assertRaises(ValueError, test_start_dtype) def test_end_dtype(): - end = fluid.data(shape=[1], dtype="int32", name="end") + end = fluid.data(shape=[1], dtype="float64", name="end") fluid.layers.linspace(0, end, 1, dtype="float32") - self.assertRaises(TypeError, test_end_dtype) + 
self.assertRaises(ValueError, test_end_dtype) def test_num_dtype(): num = fluid.data(shape=[1], dtype="int32", name="step") diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py new file mode 100644 index 0000000000000000000000000000000000000000..ed1939dbe279f28883d9e33178f1cfa256140e33 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -0,0 +1,165 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import six +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from test_imperative_base import new_program_scope + + +def convolutional_neural_network(img): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return prediction + + +def static_train_net(img, label): + prediction = convolutional_neural_network(img) + + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer.minimize(avg_loss) + + return prediction, avg_loss + + +class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): + def setUp(self): + self.seed = 90 + self.epoch_num = 1 + self.batch_size = 128 + self.batch_num = 10 + + def train_and_save_model(self): + with new_program_scope(): + startup_program = fluid.default_startup_program() + main_program = fluid.default_main_program() + + img = fluid.data( + name='img', shape=[None, 1, 28, 28], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + + prediction, avg_loss = static_train_net(img, label) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe.run(startup_program) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=100), + batch_size=self.batch_size) + + for _ in range(0, self.epoch_num): + for batch_id, data in enumerate(train_reader()): + exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_loss]) + + if batch_id > self.batch_num: + break + + static_param_dict = {} + for param in fluid.default_main_program().all_parameters(): + static_param_dict[param.name] = fluid.executor._fetch_var( + param.name) + + fluid.io.save_inference_model( + self.save_dirname, ["img"], [prediction], + exe, + model_filename=self.model_filename, + 
params_filename=self.params_filename) + + return static_param_dict + + def check_load_state_dict(self, orig_dict, load_dict): + for var_name, value in six.iteritems(orig_dict): + self.assertTrue(np.array_equal(value, load_dict[var_name])) + + def test_load_default(self): + self.save_dirname = "static_mnist.load_state_dict.default" + self.model_filename = None + self.params_filename = None + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.separate_params = True + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_model_filename(self): + self.save_dirname = "static_mnist.load_state_dict.model_filename" + self.model_filename = "static_mnist.model" + self.params_filename = None + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.separate_params = True + configs.model_filename = self.model_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_param_filename(self): + self.save_dirname = "static_mnist.load_state_dict.param_filename" + self.model_filename = None + self.params_filename = "static_mnist.params" + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.params_filename = self.params_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_model_and_param_filename(self): + self.save_dirname = "static_mnist.load_state_dict.model_and_param_filename" + self.model_filename = "static_mnist.model" + self.params_filename = "static_mnist.params" + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.params_filename = self.params_filename + configs.model_filename = self.model_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index 98d8b7f9f88d2f8892bb2ac8190fbb3c9f19e047..44a653521a9c4878f6135c7f78f4e779c929e7d3 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -59,7 +59,7 @@ class TestLookupTableOpWithTensorIds(OpTest): def setUp(self): self.op_type = "lookup_table_v2" table = np.random.random((17, 31)).astype("float64") - ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int64") + ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int32") self.inputs = {'W': table, 'Ids': ids} self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} @@ -100,7 +100,7 @@ class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): class TestLookupTableWIsSelectedRows(unittest.TestCase): def prepare_ids(self, scope, place): ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.array([0, 4, 3, 5]).astype("int64") + ids_array = np.array([0, 4, 3, 5]).astype("int32") ids_tensor.set(ids_array, place) return ids_array diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py new file mode 100644 index 
0000000000000000000000000000000000000000..e0edf9019356f38eb3c74b9cadfa6ae575e9b823 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + + +class EmbeddingDygraph(unittest.TestCase): + def test_1(self): + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + # example 1 + inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') + inp_word.shape # [2, 3] + dict_size = 20 + + emb = nn.Embedding(dict_size, 32, weight_attr='emb.w', sparse=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c91ceb39de42c44f9ce81658aa79b896999552 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
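(The new dygraph embedding test above builds the layer but stops before a forward pass; as a rough standalone sketch, not part of this diff and assuming the paddle.nn.Embedding(num_embeddings, embedding_dim, ...) signature the test already uses, the lookup it sets up would be exercised like this:)

    import numpy as np
    import paddle
    import paddle.nn as nn

    paddle.disable_static()
    # token ids for a batch of 2 sequences of length 3, as in the test above
    inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64')
    emb = nn.Embedding(20, 32, sparse=False)   # vocabulary of 20, embedding dim 32
    out = emb(paddle.to_tensor(inp_word))      # lookup; expected shape [2, 3, 32]
    print(out.shape)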
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.nn.functional as functional + + +class EmbeddingStatic(unittest.TestCase): + def test_1(self): + prog = fluid.Program() + with fluid.program_guard(prog): + + def test_bad_x(): + initializer = fluid.initializer.NumpyArrayInitializer( + np.random.random(size=(128, 100))) + + param_attr = fluid.ParamAttr( + name="emb_weight", + learning_rate=0.5, + initializer=initializer, + trainable=True) + + weight = prog.global_block().create_parameter( + (128, 100), attr=param_attr, dtype="float32") + + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="int64") + + emb = functional.embedding( + x=label, weight=weight, sparse=True, name="embedding") + + test_bad_x() + + def test_2(self): + prog = fluid.Program() + with fluid.program_guard(prog): + + def test_bad_x(): + initializer = fluid.initializer.NumpyArrayInitializer( + np.random.random(size=(128, 100))) + + param_attr = fluid.ParamAttr( + name="emb_weight", + learning_rate=0.5, + initializer=initializer, + trainable=True) + + weight = prog.global_block().create_parameter( + (128, 100), attr=param_attr, dtype="float32") + + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="int32") + + emb = functional.embedding( + x=label, weight=weight, sparse=True, name="embedding") + + test_bad_x() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py index 3e6855feaf491727203063f5c75c68301abbe05e..995a1f26ff6eb86c9198a164bcef80bebe3a8e79 100644 --- a/python/paddle/fluid/tests/unittests/test_normal.py +++ b/python/paddle/fluid/tests/unittests/test_normal.py @@ -18,6 +18,7 @@ import paddle import copy np.random.seed(10) +paddle.manual_seed(10) class TestNormalAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 2e6e516aa2edde79e6524b4b35507ea95876ec53..91d705223316360b8c05954259724a5f7d246440 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -832,8 +832,8 @@ class TestRecomputeOptimizer(unittest.TestCase): recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) recompute_optimizer._set_checkpoints([b1_out]) try: - stat_dict = {} - recompute_optimizer.load(stat_dict) + state_dict = {} + recompute_optimizer.load(state_dict) except NotImplementedError as e: self.assertEqual( "load function is not supported by Recompute Optimizer for now", diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index bac196b1ab52b604a85321a5473d455d2616bf0d..9cc507aa9b7918e854d56f1c8482f1b875910fb4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -47,5 +47,21 @@ class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5) +class TestFleetDygraphMnist(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._gpu_fleet_api = True + + def test_mnist(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_mnist.py", + 
delta=1e-5, + check_error_log=True, + log_name=flag_name) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 1c05b96f1fc61234028e940f6403ae08a0186027..25216175d59935535a352b02afc3c8f371cedd63 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -143,6 +143,27 @@ class TestPool1d_API(unittest.TestCase): result = avg_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.avg_pool1d( + input, + kernel_size=2, + stride=2, + padding=[1], + count_include_pad=True) + + result_np = avg_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[1], exclusive=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool1d_dg = paddle.nn.AvgPool1d( + kernel_size=2, stride=None, padding=1, count_include_pad=True) + result = avg_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 93a2be6de342efc4e8284e7c352137d0a3a1bcb9..91faf78418b0d3a92a3cb6a167b6024b1beb3898 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
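(A standalone sketch of the padded avg_pool1d path covered by the new check_avg_dygraph_padding_results above; illustrative only, it mirrors the functional call from the test and the usual pooling size arithmetic:)

    import numpy as np
    import paddle.fluid as fluid
    import paddle.nn.functional as F

    with fluid.dygraph.guard(fluid.CPUPlace()):
        x_np = np.random.random([2, 3, 32]).astype("float32")
        x = fluid.dygraph.to_variable(x_np)
        # kernel 2, stride 2, symmetric zero padding of 1; padded zeros count in the mean
        y = F.avg_pool1d(x, kernel_size=2, stride=2, padding=[1],
                         count_include_pad=True)
        print(y.shape)  # [2, 3, 17], since floor((32 + 2*1 - 2) / 2) + 1 = 17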
-from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive +from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive import unittest from op_test import OpTest import numpy as np @@ -68,6 +68,47 @@ class TestPool2d_API(unittest.TestCase): result = avg_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = avg_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[1, 1], + ceil_mode=False, + exclusive=False) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=1, ceil_mode=False) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = avg_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + ceil_mode=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data( @@ -108,6 +149,70 @@ class TestPool2d_API(unittest.TestCase): result = max_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_nhwc_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable( + np.transpose(input_np, [0, 2, 3, 1])) + result = max_pool2d( + input, + kernel_size=2, + stride=2, + padding=0, + return_indices=False, + data_format="NHWC") + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='max') + self.assertTrue( + np.allclose( + np.transpose(result.numpy(), [0, 3, 1, 2]), result_np)) + + def check_max_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool2d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = max_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[1, 1], + ceil_mode=False, + exclusive=False) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=1, ceil_mode=False) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool2d( + input, kernel_size=2, stride=2, 
padding=0, ceil_mode=True) + + result_np = max_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + ceil_mode=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_stride_is_none(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32]).astype("float32") @@ -215,6 +320,9 @@ class TestPool2d_API(unittest.TestCase): self.check_avg_dygraph_stride_is_none(place) self.check_max_dygraph_padding(place) self.check_avg_divisor(place) + self.check_max_dygraph_padding_results(place) + self.check_max_dygraph_ceilmode_results(place) + self.check_max_dygraph_nhwc_results(place) class TestPool2dError_API(unittest.TestCase): @@ -370,6 +478,22 @@ class TestPool2dError_API(unittest.TestCase): self.assertRaises(ValueError, run8) + def run9(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + ceil_mode=False, + data_format='NHWC', + return_indices=True) + + self.assertRaises(ValueError, run9) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index cc078e9aae7aafe55e937b80270dd012fd64ff70..a77f1cdd57d7bade92e2a4f914dc3d91624d4845 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -22,7 +22,7 @@ import paddle.fluid.core as core from op_test import OpTest import paddle.fluid as fluid from paddle.nn.functional import avg_pool3d, max_pool3d -from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive +from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive, avg_pool3D_forward_naive, max_pool3D_forward_naive class TestPool3d_API(unittest.TestCase): @@ -73,6 +73,58 @@ class TestPool3d_API(unittest.TestCase): result = avg_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool3d( + input, + kernel_size=2, + stride=2, + padding=1, + ceil_mode=False, + count_include_pad=True) + + result_np = avg_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[1, 1, 1], + ceil_mode=False, + exclusive=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, + stride=None, + padding=1, + ceil_mode=False, + count_include_pad=True) + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool3d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = avg_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + ceil_mode=True) + + 
self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, stride=None, padding=0, ceil_mode=True) + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data( @@ -112,6 +164,74 @@ class TestPool3d_API(unittest.TestCase): result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_ndhwc_results(self, place): + print("run ndchw max pool3d") + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable( + np.transpose(input_np, [0, 2, 3, 4, 1])) + result = max_pool3d( + input, + kernel_size=2, + stride=2, + padding=0, + data_format="NDHWC", + return_indices=False) + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='max') + + self.assertTrue( + np.allclose( + np.transpose(result.numpy(), [0, 4, 1, 2, 3]), result_np)) + + def check_max_dygraph_ceilmode_results(self, place): + print("run ceil mode max pool3d") + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool3d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = max_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + ceil_mode=True) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=None, padding=0, ceil_mode=True) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool3d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = max_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[1, 1, 1], + ceil_mode=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=None, padding=1, ceil_mode=False) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_stride_is_none(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") @@ -205,6 +325,8 @@ class TestPool3d_API(unittest.TestCase): self.check_max_dygraph_stride_is_none(place) self.check_max_dygraph_padding(place) self.check_avg_divisor(place) + self.check_max_dygraph_ndhwc_results(place) + self.check_max_dygraph_ceilmode_results(place) class TestPool3dError_API(unittest.TestCase): @@ -336,6 +458,21 @@ class TestPool3dError_API(unittest.TestCase): self.assertRaises(ValueError, run9) + def run10(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + data_format='NDHWC', + return_indices=True) + + self.assertRaises(ValueError, run10) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rand_op.py 
b/python/paddle/fluid/tests/unittests/test_rand_op.py index 1eceeaadfec651ade5031ddc7e6a012244050e84..4b8fe8c7e4786417de2f80dbb9953530781f9189 100644 --- a/python/paddle/fluid/tests/unittests/test_rand_op.py +++ b/python/paddle/fluid/tests/unittests/test_rand_op.py @@ -120,24 +120,24 @@ class TestRandDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.rand([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.rand([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.rand([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 88b07f5df83f8f967f8ba76e78b37ecfb2c54276..7880b48cd7d5a006d78b836be3d9d2f0b1e04c5e 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -58,6 +58,11 @@ class TestRandintOpError(unittest.TestCase): self.assertRaises(TypeError, paddle.randint, 5, dtype='float32') self.assertRaises(ValueError, paddle.randint, 5, 5) self.assertRaises(ValueError, paddle.randint, -5) + self.assertRaises(TypeError, paddle.randint, 5, shape=['2']) + shape_tensor = paddle.static.data('X', [1]) + self.assertRaises(TypeError, paddle.randint, 5, shape=shape_tensor) + self.assertRaises( + TypeError, paddle.randint, 5, shape=[shape_tensor]) class TestRandintOp_attr_tensorlist(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index cf35f9dbcdaaae1357ccdfd6b5cba85ac98d2037..b0b85f633a2bf613cdbdcc2ba7b31b5d970da8ca 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -475,87 +475,71 @@ class API_TestSumOpError(unittest.TestCase): def test_errors(self): def test_dtype1(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float32") - paddle.sum(data, dtype="int32") + data = fluid.data(name="data", shape=[10], dtype="float64") + paddle.sum(data, dtype="float32") self.assertRaises(ValueError, test_dtype1) def test_dtype2(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float32") - paddle.sum(data, dtype="float32") + data = fluid.data(name="data", shape=[10], dtype="int64") + paddle.sum(data, dtype="int32") self.assertRaises(ValueError, test_dtype2) def test_dtype3(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, dtype="bool") + data = fluid.data(name="data", shape=[10], dtype="float64") + paddle.sum(data, dtype="int32") self.assertRaises(ValueError, test_dtype3) - def test_dtype4(): + def test_type(): with fluid.program_guard(fluid.Program(), fluid.Program()): data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, 
dtype="int32") + paddle.sum(data, dtype="bool") - self.assertRaises(ValueError, test_dtype3) + self.assertRaises(TypeError, test_type) class API_TestSumOp(unittest.TestCase): - def test_static(self): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="float32") - result_sum = paddle.sum(x=data, axis=1, dtype="float64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.rand(10, 10).astype(np.float32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual( - (res == np.sum(input_data.astype(np.float64), axis=1)).all(), True) + def run_static(self, + shape, + x_dtype, + attr_axis, + attr_dtype=None, + np_axis=None): + if np_axis is None: + np_axis = attr_axis with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1, dtype="int64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual( - (res == np.sum(input_data.astype(np.int64), axis=1)).all(), True) + data = fluid.data("data", shape=shape, dtype=x_dtype) + result_sum = paddle.sum(x=data, axis=attr_axis, dtype=attr_dtype) - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) + exe = fluid.Executor(fluid.CPUPlace()) + input_data = np.random.rand(*shape).astype(x_dtype) res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual((res == np.sum(input_data, axis=1)).all(), True) - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual((res == np.sum(input_data, axis=1)).all(), True) + self.assertTrue( + np.allclose( + res, np.sum(input_data.astype(attr_dtype), axis=np_axis))) - with fluid.program_guard(fluid.Program(), fluid.Program()): - input_data = np.random.randint(10, size=(5, 5, 5)).astype(np.int32) - data = fluid.data("data", shape=[5, 5, 5], dtype="int32") - sum1 = paddle.sum(x=data, axis=[0, 1]) - sum2 = paddle.sum(x=data, axis=()) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - res1, res2 = exe.run(feed={"data": input_data}, - fetch_list=[sum1, sum2]) - - self.assertEqual((res1 == np.sum(input_data, axis=(0, 1))).all(), True) - self.assertEqual( - (res2 == np.sum(input_data, axis=(0, 1, 2))).all(), True) + def test_static(self): + shape = [10, 10] + axis = 1 + + self.run_static(shape, "int32", axis, attr_dtype=None) + self.run_static(shape, "int32", axis, attr_dtype="int32") + self.run_static(shape, "int32", axis, attr_dtype="int64") + + self.run_static(shape, "float32", axis, attr_dtype=None) + self.run_static(shape, "float32", axis, attr_dtype="float32") + self.run_static(shape, "float32", axis, attr_dtype="float64") + + shape = [5, 5, 5] + self.run_static(shape, "int32", (0, 1), attr_dtype="int32") + self.run_static( + shape, "int32", (), attr_dtype="int32", 
np_axis=(0, 1, 2)) def test_dygraph(self): np_x = np.random.random([2, 3, 4]).astype('int32') diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 44087c5421a5ee66273ef35b935926d42dcc37ae..167a8a017c24a01a6475a03835222d33c601396e 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -106,9 +106,9 @@ def bow_net(data, label, dict_dim, is_sparse=False, - emb_dim=128, - hid_dim=128, - hid_dim2=96, + emb_dim=8, + hid_dim=8, + hid_dim2=6, class_dim=2): """ BOW net @@ -132,8 +132,8 @@ class TestRegularizer(unittest.TestCase): def setUp(self): self.word_dict = paddle.dataset.imdb.word_dict() reader = paddle.batch( - paddle.dataset.imdb.train(self.word_dict), batch_size=8)() - self.train_data = [next(reader) for _ in range(5)] + paddle.dataset.imdb.train(self.word_dict), batch_size=1)() + self.train_data = [next(reader) for _ in range(1)] def get_places(self): places = [core.CPUPlace()] @@ -245,14 +245,14 @@ class TestRegularizer(unittest.TestCase): sgd.minimize(loss) with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( - np.random.randn(3, 5).astype('float32')) + np.random.randn(3, 2).astype('float32')) paddle.manual_seed(1) paddle.framework.random._manual_program_seed(1) linear1 = fluid.dygraph.Linear( - 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) linear2 = fluid.dygraph.Linear( - 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) loss1 = linear1(input) loss1.backward() diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index f7b9d4214d36a422a3ec94dc410e58c6c827ef4c..ddac7f6b98b19d204d20ccdff75c6d4fcae50d4d 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -276,6 +276,19 @@ class TestRMSPropV2(unittest.TestCase): learning_rate=0.1, momentum=None) + def test_rmsprop_op_invalid_input(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.RMSProp( + 0.1, epsilon=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.RMSProp( + 0.1, momentum=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.RMSProp( + 0.1, rho=-1, parameters=linear.parameters()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py index fd5c02c55db4c22d9edd604b7998a5405961d596..8dd71c5a558094ce6f259105eeb1aafb834ad6dc 100644 --- a/python/paddle/fluid/tests/unittests/test_stack_op.py +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -182,6 +182,11 @@ class API_test(unittest.TestCase): expected_result = np.stack([input1, input2, input3], axis=0) self.assertTrue(np.allclose(expected_result, result)) + def test_single_tensor_error(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = paddle.rand([2, 3]) + self.assertRaises(TypeError, paddle.stack, x) + class API_DygraphTest(unittest.TestCase): def test_out(self): @@ -192,18 +197,23 @@ class API_DygraphTest(unittest.TestCase): x1 = fluid.dygraph.to_variable(data1) x2 = fluid.dygraph.to_variable(data2) x3 = 
fluid.dygraph.to_variable(data3) - result = paddle.stack([x1, x2, x3], axis=0) + result = paddle.stack([x1, x2, x3]) result_np = result.numpy() - expected_result = np.stack([data1, data2, data3], axis=0) + expected_result = np.stack([data1, data2, data3]) self.assertTrue(np.allclose(expected_result, result_np)) with fluid.dygraph.guard(): y1 = fluid.dygraph.to_variable(data1) - result = paddle.stack(y1, axis=0) + result = paddle.stack([y1], axis=0) result_np_2 = result.numpy() - expected_result_2 = np.stack(data1, axis=0) + expected_result_2 = np.stack([data1], axis=0) self.assertTrue(np.allclose(expected_result_2, result_np_2)) + def test_single_tensor_error(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor([1, 2, 3]) + self.assertRaises(Exception, paddle.stack, x) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..20c51b9afbafac9ba1fa032aea446383bc2b9796 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
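(A quick sketch of the stack behaviour the new single-tensor checks above pin down; illustrative, using the same calls the tests make:)

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    data = np.random.random([1, 2]).astype("float32")
    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(data)
        out = paddle.stack([x], axis=0)   # inputs are passed as a list; result shape [1, 1, 2]
        # passing the bare tensor instead of a list raises, as the new
        # test_single_tensor_error cases assert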
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.nn as nn +import paddle.optimizer as opt + +BATCH_SIZE = 16 +BATCH_NUM = 4 +EPOCH_NUM = 4 +SEED = 10 + +IMAGE_SIZE = 784 +CLASS_NUM = 10 + + +# define a random dataset +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + np.random.seed(SEED) + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + @paddle.jit.to_static + def forward(self, x): + return self._linear(x) + + +def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format(epoch_id, batch_id, + np.mean(loss.numpy()))) + return loss + + +class TestTranslatedLayer(unittest.TestCase): + def setUp(self): + # enable dygraph mode + place = paddle.CPUPlace() + paddle.disable_static(place) + + # config seed + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + # create network + self.layer = LinearNet() + self.loss_fn = nn.CrossEntropyLoss() + self.sgd = opt.SGD(learning_rate=0.001, + parameters=self.layer.parameters()) + + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + self.loader = paddle.io.DataLoader( + dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train + train(self.layer, self.loader, self.loss_fn, self.sgd) + + # save + self.model_path = "linear.example.model" + paddle.jit.save(self.layer, self.model_path) + + def test_inference_and_fine_tuning(self): + self.load_and_inference() + self.load_and_fine_tuning() + + def load_and_inference(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + # inference + x = paddle.randn([1, IMAGE_SIZE], 'float32') + + self.layer.eval() + orig_pred = self.layer(x) + + translated_layer.eval() + pred = translated_layer(x) + + self.assertTrue(np.array_equal(orig_pred.numpy(), pred.numpy())) + + def load_and_fine_tuning(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + # train original layer continue + self.layer.train() + orig_loss = train(self.layer, self.loader, self.loss_fn, self.sgd) + + # fine-tuning + translated_layer.train() + sgd = opt.SGD(learning_rate=0.001, + parameters=translated_layer.parameters()) + loss = train(translated_layer, self.loader, self.loss_fn, sgd) + + self.assertTrue( + np.array_equal(orig_loss.numpy(), loss.numpy()), + msg="original loss:\n{}\nnew loss:\n{}\n".format(orig_loss.numpy(), + loss.numpy())) + + def test_get_program(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + program = translated_layer.program() + self.assertTrue(isinstance(program, paddle.static.Program)) + + def test_get_program_method_not_exists(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + with self.assertRaises(ValueError): + program = translated_layer.program('not_exists') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py 
b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 56dc27a9a5b136829ce410b50998e23b77510665..5ecf25c53b794f07e298b986eff5700698b8bff7 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -239,12 +239,12 @@ class TestUniformRandomOpSelectedRows(unittest.TestCase): op = Operator( "uniform_random", Out="X", - shape=[100, 784], + shape=[1000, 784], min=-5.0, max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -260,15 +260,15 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( op = Operator( "uniform_random", Out="X", - shape=[100, 784], + shape=[500, 784], min=-5.0, max=10.0, seed=10, - diag_num=100, + diag_num=500, diag_step=784, diag_val=1.0) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [500, 784]) hist, prob = output_hist_diag(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -391,7 +391,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_tensor = scope.var("Shape").get_tensor() - shape_tensor.set(np.array([100, 784]).astype("int64"), place) + shape_tensor.set(np.array([1000, 784]).astype("int64"), place) paddle.manual_seed(10) op = Operator( "uniform_random", @@ -401,7 +401,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -423,7 +423,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_1 = scope.var("shape1").get_tensor() - shape_1.set(np.array([100]).astype("int64"), place) + shape_1.set(np.array([1000]).astype("int64"), place) shape_2 = scope.var("shape2").get_tensor() shape_2.set(np.array([784]).astype("int64"), place) paddle.manual_seed(10) @@ -435,7 +435,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -540,24 +540,24 @@ class TestUniformDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.uniform([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.uniform([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.uniform([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git 
a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index c8383bb950d3ed7b2cdfafa185b0ad156bf7c7bf..deb49a3ffc2b5febf97680bc652e9695fb253373 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -33,16 +33,28 @@ class TestVarBase(unittest.TestCase): def _test_place(place): with fluid.dygraph.guard(): paddle.set_default_dtype('float32') + # set_default_dtype should not take effect on int x = paddle.to_tensor(1, place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1])) self.assertNotEqual(x.dtype, core.VarDesc.VarType.FP32) + # set_default_dtype should not take effect on numpy + x = paddle.to_tensor( + np.array([1.2]).astype('float16'), + place=place, + stop_gradient=False) + self.assertTrue( + np.array_equal(x.numpy(), np.array([1.2], 'float16'))) + self.assertEqual(x.dtype, core.VarDesc.VarType.FP16) + + # set_default_dtype take effect on float x = paddle.to_tensor(1.2, place=place, stop_gradient=False) self.assertTrue( np.array_equal(x.numpy(), np.array([1.2]).astype( 'float32'))) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) + # set_default_dtype take effect on complex x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) self.assertEqual(x.dtype, 'complex64') diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index af788874191335ad31d1540bcc0db90cc12383c6..f33e4e0fca8727574bcd1970e26c6eaee2139a05 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ __all__ = [ ] __all__ += [ - 'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad', - 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'SaveLoadConfig', 'to_variable', + 'no_grad', 'DataParallel' ] __all__ += [ @@ -50,6 +50,7 @@ from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS +from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 2555d24464112ed8446d863dc8e65cfa37680b36..ba2cf603d4a69f118320e40f1f953cb4c5fcfb39 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid from paddle.fluid import core -__all__ = ['manual_seed'] +__all__ = ['manual_seed', 'get_cuda_rng_state', 'set_cuda_rng_state'] def manual_seed(seed): @@ -42,10 +42,69 @@ def manual_seed(seed): seed = int(seed) + if core.is_compiled_with_cuda(): + for i in range(core.get_cuda_device_count()): + core.default_cuda_generator(i)._is_init_py = True + core.default_cuda_generator(i).manual_seed(seed) + core.default_cpu_generator()._is_init_py = True return core.default_cpu_generator().manual_seed(seed) +def get_cuda_rng_state(): + """ + + Get random state of cuda generators. + + Args: + None + + Returns: + GeneratorState: object. + + Examples: + .. 
code-block:: python + + import paddle + sts = paddle.get_cuda_rng_state() + + """ + state_list = [] + if core.is_compiled_with_cuda(): + for i in range(core.get_cuda_device_count()): + state_list.append(core.default_cuda_generator(i).get_state()) + + return state_list + + +def set_cuda_rng_state(state_list): + """ + + Sets generator state for all cuda generators + + Args: + state_list(list): The cuda states to set back to cuda generators. state_list is obtained from get_cuda_rng_state(). + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + sts = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(sts) + + """ + if core.is_compiled_with_cuda(): + if not len(state_list) == core.get_cuda_device_count(): + raise ValueError( + "Length of cuda state list shoule be equal to the cuda device count" + ) + for i in range(core.get_cuda_device_count()): + core.default_cuda_generator(i).set_state(state_list[i]) + + def _manual_program_seed(seed): """ Sets global seed for generating random numbers. diff --git a/python/paddle/hapi/__init__.py b/python/paddle/hapi/__init__.py index 87f5a82525cdfa36e48d40c6d12488d359fe99db..67965de5d97621e188acfa1e0384325b9ec5b7aa 100644 --- a/python/paddle/hapi/__init__.py +++ b/python/paddle/hapi/__init__.py @@ -14,14 +14,12 @@ from . import logger from . import callbacks +from . import model_summary from . import model from .model import * - -from .dygraph_layer_patch import monkey_patch_layer +from .model_summary import summary logger.setup_logger() -__all__ = ['callbacks'] + model.__all__ - -monkey_patch_layer() +__all__ = ['callbacks'] + model.__all__ + ['summary'] diff --git a/python/paddle/hapi/dygraph_layer_patch.py b/python/paddle/hapi/dygraph_layer_patch.py deleted file mode 100644 index e3a2948b69305fcb08c14c850f5738ac46aea2be..0000000000000000000000000000000000000000 --- a/python/paddle/hapi/dygraph_layer_patch.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -import paddle.fluid as fluid -from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.framework import _current_expected_place as _get_device - - -def monkey_patch_layer(): - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): - ''' - Set parameters from stat_dict. All the parameters will be reset by the - tensor in the stat_dict - - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters - include_sublayers(bool, optional) : If true, also include the - parameters from sublayers. Default: True - use_structured_name(bool, optional) : If true, use structured name - as key, otherwise, use parameter name as key. Default: True - Returns: - None - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - emb.load_dict( para_state_dict ) - - ''' - - def _check_match(key, param): - state = stat_dict.get(key, None) - if state is None: - raise ValueError( - "{} is not found in the providing file.".format(key)) - if list(state.shape) != list(param.shape): - raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state.shape), list(param.shape))) - return param, state - - matched_param_state = [] - for key, param in self.state_dict().items(): - key_name = key if use_structured_name else param.name - try: - match_res = _check_match(key_name, param) - matched_param_state.append(match_res) - except ValueError as err: - warnings.warn(("Skip loading for {}. ".format(key) + str(err))) - - if in_dygraph_mode(): - for param, state in matched_param_state: - param.set_value(state) - else: - - def _set_var(var, ndarray): - t = fluid.global_scope().find_var(var.name).get_tensor() - p = t._place() - if p.is_cpu_place(): - place = fluid.CPUPlace() - elif p.is_cuda_pinned_place(): - place = fluid.CUDAPinnedPlace() - else: - p = fluid.core.Place() - p.set_place(t._place()) - place = fluid.CUDAPlace(p.gpu_device_id()) - t.set(ndarray, place) - - executor = fluid.Executor(_get_device())._default_executor - # restore parameter states - fluid.core._create_loaded_parameter( - [param for param, state in matched_param_state], - fluid.global_scope(), executor) - for param, state in matched_param_state: - _set_var(param, state) - - setattr(fluid.dygraph.Layer, 'load_dict', load_dict) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 5aa689ca324c099f239a29e2ee21b8283e378341..2836a151ec35698a31f3814d573828853349a151 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -47,10 +47,10 @@ from paddle.io import DataLoader, Dataset, DistributedBatchSampler from paddle.fluid.executor import scope_guard, Executor from paddle.fluid.dygraph.layers import Layer from paddle.metric import Metric - from paddle.static import InputSpec as Input from .callbacks import config_callbacks +from .model_summary import summary __all__ = ['Model', ] @@ -731,8 +731,8 @@ class DynamicGraphAdapter(object): if not self.model._optimizer or not optim_state: return - # If optimizer performs set_dict when state vars haven't been created, - # which would happen when set_dict before minimize, the state would be + # If optimizer performs set_state_dict when state vars haven't been created, + # which would happen when set_state_dict before minimize, the state would be # stored in optimizer._accumulators_holder and loaded lazily. # To contrive this when loading from static-graph saved states, extend # state dict to include keys named accoring to dygraph naming rules. 
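The hunk below makes the adapter prefer the Paddle 2.0 `set_state_dict` API and only fall back to the deprecated `set_dict`. A minimal standalone sketch of that fallback pattern (the `optimizer` and `converted_state` names are assumed here, mirroring the adapter code):

    import warnings

    def restore_optimizer_state(optimizer, converted_state):
        # Prefer the 2.0 API name; otherwise warn and fall back to the
        # deprecated paddle.fluid.optimizer method.
        if hasattr(optimizer, 'set_state_dict'):
            optimizer.set_state_dict(converted_state)
        else:
            warnings.warn(
                "paddle.fluid.optimizer is deprecated in API 2.0, "
                "please use paddle.optimizer instead")
            optimizer.set_dict(converted_state)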
@@ -776,7 +776,13 @@ class DynamicGraphAdapter(object): accum_name + "_0") converted_state[dy_state_name] = state_var - self.model._optimizer.set_dict(converted_state) + if not hasattr(self.model._optimizer, 'set_state_dict'): + warnings.warn( + "paddle.fluid.optimizer is deprecated in API 2.0, please use paddle.optimizer instead" + ) + self.model._optimizer.set_dict(converted_state) + else: + self.model._optimizer.set_state_dict(converted_state) class Model(object): @@ -1822,6 +1828,54 @@ class Model(object): return logs, outputs return logs + def summary(self, input_size=None, batch_size=None, dtype=None): + """Prints a string summary of the network. + + Args: + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. + if not set, input_size will get from ``self._inputs`` if network only have + one input, input_size can be tuple or InputSpec. if model have multiple + input, input_size must be a list which contain every input's shape. + Default: None. + batch_size (int, optional): batch size of input tensor, Default: None. + dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. + + Returns: + Dict: a summary of the network including total params and total trainable params. + + Examples: + .. code-block:: python + + import paddle + from paddle.static import InputSpec + + dynamic = True + device = paddle.set_device('cpu') + paddle.disable_static(device) if dynamic else None + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') + + model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + input, label) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + paddle.nn.CrossEntropyLoss()) + + params_info = model.summary() + print(params_info) + + """ + assert (input_size is not None or self._inputs is not None + ), "'input_size' or 'self._input' must be set" + if input_size is not None: + _input_size = input_size + else: + _input_size = self._inputs + return summary(self.network, _input_size, batch_size, dtype) + def _verify_spec(self, specs, is_input=False): out_specs = [] diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..716be1b539809ea3f90885b512f51ac45d85cd37 --- /dev/null +++ b/python/paddle/hapi/model_summary.py @@ -0,0 +1,229 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +from paddle.static import InputSpec + +from collections import OrderedDict + +__all__ = ['summary'] + + +def summary(net, input_size, batch_size=None, dtypes=None): + """Prints a string summary of the network. + + Args: + net (Layer): the network which must be a subinstance of Layer. + input_size (tuple|InputSpec|list[tuple|InputSpec]): size of input tensor. 
if model only + have one input, input_size can be tuple or InputSpec. if model + have multiple input, input_size must be a list which contain + every input's shape. + batch_size (int, optional): batch size of input tensor, Default: None. + dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. + + Returns: + Dict: a summary of the network including total params and total trainable params. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + + class LeNet(nn.Layer): + def __init__(self, num_classes=10): + super(LeNet, self).__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d( + 1, 6, 3, stride=1, padding=1), + nn.ReLU(), + nn.MaxPool2d(2, 2), + nn.Conv2d( + 6, 16, 5, stride=1, padding=0), + nn.ReLU(), + nn.MaxPool2d(2, 2)) + + if num_classes > 0: + self.fc = nn.Sequential( + nn.Linear(400, 120), + nn.Linear(120, 84), + nn.Linear( + 84, 10)) + + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.fc(x) + return x + + lenet = LeNet() + + params_info = paddle.summary(lenet, (1, 28, 28)) + print(params_info) + + """ + if isinstance(input_size, InputSpec): + _input_size = tuple(input_size.shape[1:]) + if batch_size is None: + batch_size = input_size.shape[0] + elif isinstance(input_size, list): + _input_size = [] + for item in input_size: + if isinstance(item, int): + item = (item, ) + assert isinstance(item, + (tuple, InputSpec)), 'When input_size is list, \ + expect item in input_size is a tuple or InputSpec, but got {}'.format( + type(item)) + + if isinstance(item, InputSpec): + _input_size.append(tuple(item.shape[1:])) + if batch_size is None: + batch_size = item.shape[0] + else: + _input_size.append(item) + elif isinstance(input_size, int): + _input_size = (input_size, ) + else: + _input_size = input_size + + if batch_size is None: + batch_size = -1 + + result, params_info = summary_string(net, _input_size, batch_size, dtypes) + print(result) + + return params_info + + +def summary_string(model, input_size, batch_size=-1, dtypes=None): + if dtypes == None: + dtypes = ['float32'] * len(input_size) + + summary_str = '' + + depth = len(list(model.sublayers())) + + def register_hook(module): + def hook(module, input, output): + class_name = str(module.__class__).split(".")[-1].split("'")[0] + + try: + module_idx = int(module._full_name.split('_')[-1]) + except: + module_idx = len(summary) + + m_key = "%s-%i" % (class_name, module_idx + 1) + summary[m_key] = OrderedDict() + summary[m_key]["input_shape"] = list(input[0].shape) + summary[m_key]["input_shape"][0] = batch_size + if isinstance(output, (list, tuple)): + summary[m_key]["output_shape"] = [[-1] + list(o.shape)[1:] + for o in output] + else: + summary[m_key]["output_shape"] = list(output.shape) + summary[m_key]["output_shape"][0] = batch_size + + params = 0 + if hasattr(module, "weight") and hasattr(module.weight, "shape"): + params += np.prod(module.weight.shape) + summary[m_key]["trainable"] = module.weight.trainable or ( + not module.weight.stop_gradient) + if hasattr(module, "bias") and hasattr(module.bias, "shape"): + params += np.prod(module.bias.shape) + summary[m_key]["nb_params"] = params + + if (not isinstance(module, nn.Sequential) and + not isinstance(module, nn.LayerList) and + (not (module == model) or depth < 1)): + + hooks.append(module.register_forward_post_hook(hook)) + + if isinstance(input_size, tuple): + input_size = [input_size] + + x = [ + paddle.rand( + 
[2] + list(in_size), dtype=dtype) + for in_size, dtype in zip(input_size, dtypes) + ] + + # create properties + summary = OrderedDict() + hooks = [] + + # register hook + model.apply(register_hook) + + # make a forward pass + model(*x) + + # remove these hooks + for h in hooks: + h.remove() + + table_width = 80 + summary_str += "-" * table_width + "\n" + line_new = "{:>15} {:>20} {:>20} {:>15}".format( + "Layer (type)", "Input Shape", "Output Shape", "Param #") + summary_str += line_new + "\n" + summary_str += "=" * table_width + "\n" + total_params = 0 + total_output = 0 + trainable_params = 0 + for layer in summary: + # input_shape, output_shape, trainable, nb_params + line_new = "{:>15} {:>20} {:>20} {:>15}".format( + layer, + str(summary[layer]["input_shape"]), + str(summary[layer]["output_shape"]), + "{0:,}".format(summary[layer]["nb_params"]), ) + total_params += summary[layer]["nb_params"] + + total_output += np.prod(summary[layer]["output_shape"]) + if "trainable" in summary[layer]: + if summary[layer]["trainable"] == True: + trainable_params += summary[layer]["nb_params"] + summary_str += line_new + "\n" + + # assume 4 bytes/number (float on cuda). + total_input_size = abs( + np.prod(sum(input_size, ())) * batch_size * 4. / (1024**2.)) + total_output_size = abs(2. * total_output * 4. / + (1024**2.)) # x2 for gradients + total_params_size = abs(total_params * 4. / (1024**2.)) + total_size = total_params_size + total_output_size + total_input_size + + summary_str += "=" * table_width + "\n" + summary_str += "Total params: {0:,}".format(total_params) + "\n" + summary_str += "Trainable params: {0:,}".format(trainable_params) + "\n" + summary_str += "Non-trainable params: {0:,}".format(total_params - + trainable_params) + "\n" + summary_str += "-" * table_width + "\n" + summary_str += "Input size (MB): %0.2f" % total_input_size + "\n" + summary_str += "Forward/backward pass size (MB): %0.2f" % total_output_size + "\n" + summary_str += "Params size (MB): %0.2f" % total_params_size + "\n" + summary_str += "Estimated Total Size (MB): %0.2f" % total_size + "\n" + summary_str += "-" * table_width + "\n" + # return summary + return summary_str, { + 'total_params': total_params, + 'trainable_params': trainable_params + } diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index 03299a3bb9823d31c40ae4faab601ed89570c71e..d04a65ad6ea99ee2e2e67e47fd9d656f1572a02d 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -14,7 +14,6 @@ from ..fluid.dygraph.jit import save #DEFINE_ALIAS from ..fluid.dygraph.jit import load #DEFINE_ALIAS -from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.jit import TracedLayer #DEFINE_ALIAS from ..fluid.dygraph.jit import set_code_level #DEFINE_ALIAS from ..fluid.dygraph.jit import set_verbosity #DEFINE_ALIAS @@ -23,6 +22,6 @@ from ..fluid.dygraph import ProgramTranslator #DEFINE_ALIAS from ..fluid.dygraph.io import TranslatedLayer #DEFINE_ALIAS __all__ = [ - 'save', 'load', 'SaveLoadConfig', 'TracedLayer', 'to_static', - 'ProgramTranslator', 'TranslatedLayer', 'set_code_level', 'set_verbosity' + 'save', 'load', 'TracedLayer', 'to_static', 'ProgramTranslator', + 'TranslatedLayer', 'set_code_level', 'set_verbosity' ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 3c0aa9c5c99e545b657559c30fcde46a69781231..325eaa64d5ca4bd3d65bf266ff0a42226a3199e6 100644 --- a/python/paddle/nn/functional/__init__.py +++ 
b/python/paddle/nn/functional/__init__.py @@ -233,3 +233,4 @@ from .vision import space_to_depth #DEFINE_ALIAS from .vision import yolo_box #DEFINE_ALIAS from .vision import yolov3_loss #DEFINE_ALIAS from .input import one_hot #DEFINE_ALIAS +from .input import embedding #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 623af3277fba0e29fb77b02c711e258602f1f75a..ad84a32186e8baeabbe8eea7d14e2b7391332944 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -910,12 +910,12 @@ def dropout(x, #get mask shape input_shape = x.shape drop_axes = [axis] if isinstance(axis, int) else axis - if max(drop_axes) > len(input_shape) - 1: - raise ValueError("axis value should less than dimensions of x:{}, but get drop_axes value:{} " \ + if min(drop_axes) < 0 or max(drop_axes) > len(input_shape) - 1: + raise ValueError("axis value should be greater than or equal to 0 and less than dimensions of x:{}, but get axis value:{} " \ .format(len(input_shape), max(drop_axes))) if len(drop_axes) > len(input_shape): raise ValueError( - "length of axis should not greater than dimensions of x:{}, but get length of drop axes: {}". + "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}". format(len(input_shape), len(drop_axes))) mask_shape = [1] * len(input_shape) for i in drop_axes: @@ -1091,6 +1091,8 @@ def alpha_dropout(x, p=0.5, training=True, name=None): 'alpha_dropout') if training: + if p == 1: + return layers.scale(x, scale=0.) #get transformation params alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 42d7d98aefcbbf51f562b98c4c494aeccfe20cf2..3c1482e69c3c36232ee5d70f2156a8d16c2d212a 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -232,7 +232,7 @@ def conv1d(x, raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. " "Received Attr(data_format): {}.".format(data_format)) - channel_last = (data_format == "NHWC") + channel_last = (data_format == "NLC") channel_dim = -1 if channel_last else 1 conv2d_data_format = "NHWC" if channel_last else "NCHW" num_channels = x.shape[channel_dim] @@ -399,7 +399,7 @@ def conv2d(x, `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation (int|tuple): The dilation size. It means the spacing between the kernel @@ -733,20 +733,31 @@ def conv_transpose1d(x, stride = utils.convert_to_list(stride, 1, 'stride') + [1] dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] - output_padding = utils.convert_to_list(output_padding, 1, - 'output_padding') + [0] - if output_padding[0] > stride[0]: - raise ValueError( - "The size of output_padding should not be greater than stride." 
- "But got output_padding={} and stride={}".format(output_padding[0], - stride[0])) if output_size is None: output_size = [] - elif isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 1, 'output_size') + [1] else: - raise ValueError("output_size should be int, or list, tuple of ints") + if output_padding != 0: + raise ValueError('output_padding option is mutually exclusive with ' + 'output_size') + if isinstance(output_size, (list, tuple, int)): + output_size = utils.convert_to_list(output_size, 1, + 'output_size') + [1] + else: + raise ValueError( + "output_size should be int, or list, tuple of ints") + + if output_padding == 0: + output_padding = [] + else: + output_padding = utils.convert_to_list(output_padding, 1, + 'output_padding') + [0] + + if len(output_padding) > 0 and output_padding[0] > stride[0]: + raise ValueError( + "The size of output_padding should not be greater than stride." + "But got output_padding={} and stride={}".format(output_padding[0], + stride[0])) op_type = 'conv2d_transpose' num_filters = weight.shape[1] @@ -761,16 +772,17 @@ def conv_transpose1d(x, weight = nn.unsqueeze(input=weight, axes=[-1]) if in_dygraph_mode(): - attrs = ('output_size', output_size, 'strides', stride, 'paddings', - padding, 'padding_algorithm', padding_algorithm, 'dilations', - dilation, 'groups', groups, 'use_cudnn', use_cudnn, - 'data_format', conv2d_data_format) + attrs = ('output_padding', output_padding, 'output_size', output_size, + 'strides', stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'dilations', dilation, 'groups', groups, + 'use_cudnn', use_cudnn, 'data_format', conv2d_data_format) out = getattr(core.ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) else: inputs = {'Input': [x], 'Filter': [weight]} attrs = { + 'output_padding': output_padding, 'output_size': output_size, 'strides': stride, 'paddings': padding, @@ -791,12 +803,6 @@ def conv_transpose1d(x, if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) - if output_size is None: - out = pad2d( - out, - padding=[0, output_padding, 0, 0], - data_format=conv2d_data_format, - name=name) out = nn.squeeze(input=out, axes=[squeeze_axis]) return out @@ -888,9 +894,9 @@ def conv_transpose2d(x, 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_height, pad_width]` or `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side @@ -1116,9 +1122,9 @@ def conv3d(x, 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCDHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation (int|tuple): The dilation size. It means the spacing between the kernel points. @@ -1340,9 +1346,9 @@ def conv_transpose3d(x, 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCDHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index e77bf0e39672984f7076938b134f3e54f4c761ab..bc48cc21c29e6683602f37fb3eab6c9485fe4977 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -19,7 +19,7 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.layers import core from ...fluid.data_feeder import check_variable_and_dtype, check_dtype -__all__ = ['one_hot'] +__all__ = ['one_hot', 'embedding'] def one_hot(x, num_classes, name=None): @@ -83,6 +83,7 @@ def one_hot(x, num_classes, name=None): # [0., 1., 0., 0.], # [0., 0., 0., 1.], # [1., 0., 0., 0.]] + """ if in_dygraph_mode(): @@ -94,7 +95,7 @@ def one_hot(x, num_classes, name=None): one_hot_out = helper.create_variable_for_type_inference(dtype='float32') if not isinstance(num_classes, Variable): - # user attribute + # user attribute inputs = {'X': x} attrs = {'depth': num_classes, 'allow_out_of_range': False} else: @@ -108,3 +109,115 @@ def one_hot(x, num_classes, name=None): outputs={'Out': one_hot_out}, stop_gradient=True) return one_hot_out + + +def embedding(x, weight, padding_idx=None, sparse=False, name=None): + """ + The operator is used to lookup embeddings vector of ids provided by :attr:`input` . + + The shape of output Tensor is generated by appending the last dimension of the input Tensor shape + with embedding size. + **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < weight.shape[0]` , + otherwise the program will throw an exception and exit. + + .. code-block:: text + + Case 1: + input is a Tensor. 
+ padding_idx = -1 + x.data = [[1, 3], [2, 4], [4, 127]] + x.shape = [3, 2] + weight.shape = [128, 16] + output is a Tensor: + out.shape = [3, 2, 16] + out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], + [0.345421456, 0.524563927, ..., 0.144534654]], + [[0.345249859, 0.124939536, ..., 0.194353745], + [0.945345345, 0.435394634, ..., 0.435345365]], + [[0.945345345, 0.435394634, ..., 0.435345365], + [0.0, 0.0, ..., 0.0 ]]] # padding data + + The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 + It will pad all-zero data when ids is 127. + + Args: + x(Tensor): A Tensor with type int32/int64, which contains the id information. The value of the input id should + satisfy :math:`0<= id < weight.shape[0]` . + weight (Tensor): The weight. A Tensor with shape of lookup table parameter. It should have two elements which + indicates the size of the dictionary of embeddings and the size of each embedding vector respectively. + sparse(bool): The flag indicating whether to use sparse update. This parameter only + affects the performance of the backwards gradient update. It is recommended to set + True because sparse update is faster. But some optimizers does not support sparse update, + such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , + :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , + :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + In these cases, is_sparse must be False. Default: False. + padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted + to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup + encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. + If set None, it makes no effect to output. Default: None. + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: Embedding Tensor mapped by input. The data type is the same as :attr:`weight`. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + import paddle.nn.functional as F + + paddle.disable_static() + + # ids must be int32/int64 and satisfy 0 <= id < weight.shape[0] + x = paddle.to_tensor(np.array([[3], [4], [5]]).astype('int64')) + weight = paddle.to_tensor( + np.random.random((10, 4)).astype('float32')) + + emb = F.embedding( + x=x, weight=weight, sparse=True, name="embedding") + # emb.shape is [3, 1, 4] + + """ + if in_dygraph_mode(): + return core.ops.lookup_table_v2( + weight, x, 'is_sparse', sparse, 'is_distributed', False, + 'remote_prefetch', False, 'padding_idx', padding_idx) + else: + helper = LayerHelper('embedding', **locals()) + dtype = helper.input_dtype() + + check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'embedding') + + is_distributed = False + remote_prefetch = sparse and (not is_distributed) + + tmp = helper.create_variable_for_type_inference(dtype) + padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( + weight.shape[0] + padding_idx) + + helper.append_op( + type='lookup_table_v2', + inputs={'Ids': x, + 'W': weight}, + outputs={'Out': tmp}, + attrs={ + 'is_sparse': sparse, + 'is_distributed': is_distributed, + 'remote_prefetch': remote_prefetch, + 'padding_idx': padding_idx + }) + return tmp diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index d2ddee654f4d04de152d15130ba53c424af3e5b2..3d5894064c44cb72259472fc638d46b67c5703fc 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -138,13 +138,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', .. code-block:: python import paddle - import numpy as np - input_data = np.array([0.5, 0.6, 0.7]).astype("float32") - label_data = np.array([1.0, 0.0, 1.0]).astype("float32") paddle.disable_static() - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([0.5, 0.6, 0.7], 'float32') + label = paddle.to_tensor([1.0, 0.0, 1.0], 'float32') output = paddle.nn.functional.binary_cross_entropy(input, label) print(output.numpy()) # [0.65537095] @@ -277,8 +274,8 @@ def binary_cross_entropy_with_logits(logit, import paddle paddle.disable_static() - logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") - label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") + logit = paddle.to_tensor([5.0, 1.0, 3.0]) + label = paddle.to_tensor([1.0, 0.0, 1.0]) output = paddle.nn.functional.binary_cross_entropy_with_logits(logit, label) print(output.numpy()) # [0.45618808] @@ -569,13 +566,10 @@ def l1_loss(input, label, reduction='mean', name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") - label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]]) + label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]]) l1_loss = paddle.nn.functional.l1_loss(input, label) print(l1_loss.numpy()) @@ -868,7 +862,7 @@ def mse_loss(input, label, reduction='mean', name=None): Examples: ..
code-block:: python - import numpy as np + import paddle @@ -878,8 +872,6 @@ def mse_loss(input, label, reduction='mean', name=None): input = paddle.data(name="input", shape=[1]) label = paddle.data(name="label", shape=[1]) place = paddle.CPUPlace() - input_data = np.array([1.5]).astype("float32") - label_data = np.array([1.7]).astype("float32") output = mse_loss(input,label) exe = paddle.static.Executor(place) @@ -894,8 +886,8 @@ def mse_loss(input, label, reduction='mean', name=None): # dynamic graph mode paddle.disable_static() - input = paddle.to_variable(input_data) - label = paddle.to_variable(label_data) + input = paddle.to_tensor(1.5) + label = paddle.to_tensor(1.7) output = mse_loss(input, label) print(output.numpy()) # [0.04000002] diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index e9c1a21ecffb1b64cb5ae9e6b802600625cb4685..9e8f365f6d23a95275b9a696f6088bb287108ec0 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -165,7 +165,7 @@ def batch_norm(x, w = paddle.to_tensor(weight_data) b = paddle.to_tensor(bias_data) batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b) - print batch_norm_out + print(batch_norm_out.numpy()) """ assert len(x.shape) >= 2, "input dim must be larger than 1" @@ -176,6 +176,14 @@ def batch_norm(x, mean_out = running_mean variance_out = running_var + true_data_format = ['NC', 'NCL', 'NCHW', 'NCDHW'] + if data_format not in true_data_format: + raise ValueError( + "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', but receive {}". + format(data_format)) + + data_format = 'NCHW' + if in_dygraph_mode(): # for dygraph need tuple attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", @@ -270,7 +278,7 @@ def layer_norm(x, layer_norm = paddle.nn.functional.layer_norm(x, x.shape[1:]) layer_norm_out = layer_norm(x) - print(layer_norm_out.numpy) + print(layer_norm_out.numpy()) """ input_shape = list(x.shape) input_ndim = len(input_shape) @@ -302,10 +310,10 @@ def layer_norm(x, # create output helper = LayerHelper('layer_norm', **locals()) mean_out = helper.create_variable_for_type_inference( - dtype=x.type, stop_gradient=True) + dtype=x.dtype, stop_gradient=True) variance_out = helper.create_variable_for_type_inference( - dtype=x.type, stop_gradient=True) - layer_norm_out = helper.create_variable_for_type_inference(x.type) + dtype=x.dtype, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type="layer_norm", @@ -362,7 +370,7 @@ def instance_norm(x, x = paddle.to_tensor(x_data) instance_norm_out = paddle.nn.functional.instancenorm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index c8790a75901fd5d9a38862158246e3756dc575c4..662205ab69550255406ff5edfda4556b73b98843 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -168,7 +168,7 @@ def avg_pool1d(x, count_include_pad=True, ceil_mode=False, name=None): - """ + """ This API implements average pooling 1d operation, See more details in :ref:`api_nn_pooling_AvgPool1d` . @@ -280,7 +280,7 @@ def avg_pool2d(x, """ This API implements average pooling 2d operation. See more details in :ref:`api_nn_pooling_AvgPool2d` . - + Args: x (Tensor): The input tensor of pooling operator which is a 4-D tensor with shape [N, C, H, W]. 
The format of input tensor is `"NCHW"` or @@ -640,7 +640,7 @@ def max_pool2d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. + return_indices (bool): Whether to return the max indices along with the outputs. Default False, only support `"NCHW"` data format data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. @@ -690,15 +690,30 @@ def max_pool2d(x, padding, padding_algorithm = _update_padding_nd( padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode) + if data_format == "NHWC" and return_indices: + raise ValueError( + "When setting return_indices to true, data_format must be set to NCHW in API:max_pool2d" + ) + if in_dygraph_mode(): - output = core.ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + if data_format == "NCHW": + output = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', + stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', True, 'data_format', + data_format) + return output if return_indices else output[0] + elif data_format == "NHWC" and not return_indices: + output = core.ops.pool2d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output - op_type = 'max_pool2d_with_index' + op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "max_pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -739,7 +754,7 @@ def max_pool3d(x, See more details in :ref:`api_nn_pooling_MaxPool3d` . Args: x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. kernel_size (int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). @@ -755,7 +770,7 @@ def max_pool3d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. 
Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): ${ceil_mode_comment} - return_indices (bool): Whether to return the max indices along with the outputs. + return_indices (bool): Whether to return the max indices along with the outputs. Default False. Only support "NDCHW" data_format. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. @@ -801,15 +816,30 @@ def max_pool3d(x, padding, padding_algorithm = _update_padding_nd( padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) + if data_format == "NDHWC" and return_indices: + raise ValueError( + "When setting return_indices to true, data_format must be set to NCDHW in API:max_pool3d" + ) + if in_dygraph_mode(): - output = core.ops.max_pool3d_with_index( - x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, 'global_pooling', False, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + if data_format == "NCDHW": + output = core.ops.max_pool3d_with_index( + x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', + stride, 'paddings', padding, 'global_pooling', False, + 'padding_algorithm', padding_algorithm, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output if return_indices else output[0] + elif data_format == "NDHWC" and not return_indices: + output = core.ops.pool3d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output - op_type = "max_pool3d_with_index" + op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "max_pool3d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -841,14 +871,13 @@ def adaptive_avg_pool1d(x, output_size, name=None): """ This API implements adaptive average pooling 1d operation. See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . - + Args: x (Tensor): The input tensor of pooling operator, which is a 3-D tensor with shape [N, C, L]. The format of input tensor is NCL, where N is batch size, C is the number of channels, L is the length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + output_size (int): The target output size. It must be an integer. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -856,7 +885,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): Tensor: The output tensor of adaptive average pooling result. The data type is same as input tensor. Raises: - ValueError: 'output_size' should be an integer or list or tuple with length as 1. + ValueError: 'output_size' should be an integer. Examples: .. 
code-block:: python # average adaptive pool1d @@ -977,6 +1006,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_h if output_size[1] == None: @@ -1080,6 +1110,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 3, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_l if output_size[1] == None: @@ -1124,8 +1155,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): with shape [N, C, L]. The format of input tensor is NCL, where N is batch size, C is the number of channels, L is the length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + output_size (int): The pool kernel size. The value should be an integer. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. name(str, optional): For detailed information, please refer @@ -1135,9 +1165,10 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): Tensor: The output tensor of adaptive pooling result. The data type is same as input tensor. Raises: - ValueError: 'output_size' should be a integer or list or tuple with length as 1. + ValueError: 'output_size' should be an integer. Examples: .. code-block:: python + # max adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], # output shape is [N, C, m], adaptive pool divide L dimension @@ -1163,7 +1194,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_max_pool1d') _check_input(x, 3) - check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') + check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d') check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') @@ -1202,15 +1233,19 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): """ This operation applies a 2D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . + Args: x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor. + Examples: .. 
code-block:: python + # max adaptive pool2d # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -1248,6 +1283,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_h if output_size[1] == None: @@ -1284,15 +1320,19 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): """ This operation applies a 3D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . + Args: x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor. + Examples: .. code-block:: python + # adaptive max pool3d # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1334,6 +1374,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 3, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_l if output_size[1] == None: diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 8641e28e37b00ed301b95c66d3d2d2d1e3641051..d8e1d03b02840e76ff865986d8b90ca9d6cdd9f8 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -15,7 +15,7 @@ # TODO: define the common classes to build a neural network from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS from ...fluid.dygraph import Pool2D #DEFINE_ALIAS -from ...fluid.dygraph import Embedding #DEFINE_ALIAS +from ...fluid.dygraph import Linear #DEFINE_ALIAS from ...fluid.dygraph import Flatten #DEFINE_ALIAS from ...fluid.dygraph import layers from .. import functional as F @@ -146,9 +146,9 @@ class UpSample(layers.Layer): 'nearest' : Nearest neighbor interpolation 'bicubic' : Bicubic interpolation - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -158,7 +158,7 @@ class UpSample(layers.Layer): W-direction in this op) on a rectilinear 2D grid. The key idea is to perform linear interpolation first in one direction, and then again in the other direction. 
- + Bicubic interpolation is an extension of cubic interpolation for interpolating data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or @@ -205,7 +205,7 @@ class UpSample(layers.Layer): output: (N,C,H_out,W_out) where: H_out = round(H_{in} * scale_{factor}) W_out = round(W_{in} * scale_{factor}) - + Bilinear interpolation: if: align_corners = False , align_mode = 0 @@ -252,19 +252,19 @@ class UpSample(layers.Layer): https://en.wikipedia.org/wiki/Linear_interpolation. For details of linear interpolation, please refer to Wikipedia: - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + Parameters: x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. @@ -537,8 +537,8 @@ class Pad2D(layers.Layer): If mode is 'reflect', paddings[0] and paddings[1] must be no greater than height-1. And the width dimension has the same condition. Parameters: - paddings (int | List[int32]): The padding size. If padding is a int, uses the same - padding in all boundaries, if padding is a List, it must contain four integers, + paddings (int | List[int32]): The padding size. If padding is a int, uses the same + padding in all boundaries, if padding is a List, it must contain four integers, (padding_top, padding_bottom, padding_left, padding_right). Default is [0, 0, 0, 0]. mode (str): Three modes: 'constant' (default), 'reflect', 'edge' . @@ -550,7 +550,7 @@ class Pad2D(layers.Layer): data_format (str): An string from: "NHWC", "NCHW". Specify the data format of the input data. Default is "NCHW" - Returns: + Returns: None Examples: .. code-block:: text @@ -631,11 +631,11 @@ class Bilinear(layers.Layer): in1_features (int): The dimension of each first input(`x1`). in2_features (int): The dimension of each second input(`x2`). out_features (int): The dimension of output of this layer. - weight_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of + weight_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of this layer. The default value is None. bias_attr (ParamAttr, optional): The parameter attribute for the bias of this layer. If it is set to False, no bias will be added to the output units. - If it is set to None, the bias is initialized zero. The default value is None. + If it is set to None, the bias is initialized zero. The default value is None. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. 
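For reference, a small usage sketch consistent with the `Bilinear` parameters documented above; the call signature (two input tensors of shape `[batch, in1_features]` and `[batch, in2_features]`) is assumed from the layer's description, and the output values depend on initialization:

    import numpy as np
    import paddle

    paddle.disable_static()
    x1 = paddle.to_tensor(np.random.random((5, 5)).astype('float32'))
    x2 = paddle.to_tensor(np.random.random((5, 4)).astype('float32'))
    # in1_features=5, in2_features=4, out_features=8
    bilinear = paddle.nn.Bilinear(in1_features=5, in2_features=4, out_features=8)
    result = bilinear(x1, x2)  # expected shape: [5, 8]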
@@ -702,7 +702,7 @@ class Dropout(layers.Layer): """ Dropout is a regularization technique for reducing overfitting by preventing neuron co-adaption during training as described in the paper: - `Improving neural networks by preventing co-adaptation of feature detectors `_ + `Improving neural networks by preventing co-adaptation of feature detectors `_ The dropout operator randomly sets the outputs of some units to zero, while upscale others according to the given dropout probability. @@ -771,8 +771,8 @@ class Dropout2d(layers.Layer): Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` , a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently on every forward call with probability `p` using samples from a Bernoulli distribution. - Dropout2d will help promote independence between feature maps as described in the paper: - `Efficient Object Localization Using Convolutional Networks `_ + Dropout2d will help promote independence between feature maps as described in the paper: + `Efficient Object Localization Using Convolutional Networks `_ See ``paddle.nn.functional.dropout2d`` for more details. @@ -829,8 +829,8 @@ class Dropout3d(layers.Layer): Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` , a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently on every forward call with probability `p` using samples from a Bernoulli distribution. - Dropout3d will help promote independence between feature maps as described in the paper: - `Efficient Object Localization Using Convolutional Networks `_ + Dropout3d will help promote independence between feature maps as described in the paper: + `Efficient Object Localization Using Convolutional Networks `_ See ``paddle.nn.functional.dropout3d`` for more details. @@ -1547,3 +1547,131 @@ class CosineSimilarity(layers.Layer): def forward(self, x1, x2): return F.cosine_similarity(x1, x2, axis=self._axis, eps=self._eps) + + +class Embedding(layers.Layer): + """ + :alias_main: paddle.nn.Embedding + :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding + :old_api: paddle.fluid.dygraph.Embedding + + **Embedding Layer** + + This interface is used to construct a callable object of the ``Embedding`` class. + For specific usage, refer to code examples. It implements the function of the Embedding Layer. + This layer is used to lookup embeddings vector of ids provided by :attr:`input` . + It automatically constructs a 2D embedding matrix based on the + input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . + + The shape of output Tensor is generated by appending an emb_size dimension to the + last dimension of the input Tensor shape. + + **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , + otherwise the program will throw an exception and exit. + + .. code-block:: text + + Case 1: + + input is a Tensor. 
padding_idx = -1 + input.data = [[1, 3], [2, 4], [4, 127]] + input.shape = [3, 2] + Given size = [128, 16] + output is a Tensor: + out.shape = [3, 2, 16] + out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], + [0.345421456, 0.524563927, ..., 0.144534654]], + + [[0.345249859, 0.124939536, ..., 0.194353745], + [0.945345345, 0.435394634, ..., 0.435345365]], + + [[0.945345345, 0.435394634, ..., 0.435345365], + [0.0, 0.0, ..., 0.0 ]]] # padding data + Since the input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 + It will pad all-zero data when the id is 127. + + Parameters: + num_embeddings (int): The size + of the dictionary of embeddings. + embedding_dim (int): The dimension of each embedding vector. + padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted + to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup + encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. + If set to None, it has no effect on the output. Default: None. + sparse(bool): The flag indicating whether to use sparse update. This parameter only + affects the performance of the backwards gradient update. It is recommended to set it to + True because sparse update is faster. However, some optimizers do not support sparse update, + such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , + :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , + :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + In these cases, sparse must be False. Default: False. + weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the + default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, + user-defined or pre-trained word vectors can be loaded with the :attr:`weight_attr` parameter. + The local word vectors need to be transformed into numpy format, and their shape should be + consistent with :attr:`num_embeddings` and :attr:`embedding_dim` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + is used to load custom or pre-trained word vectors. + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name does not need to be set and + is None by default. + + Attribute: + **weight** (Parameter): the learnable weights of this layer. + + Returns: + None + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + # example 1 + inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') + inp_word.shape # [2, 3] + dict_size = 20 + + emb = nn.Embedding( + dict_size, + 32, + sparse=False) + """ + + def __init__(self, + num_embeddings, + embedding_dim, + padding_idx=None, + sparse=False, + weight_attr=None, + name=None): + super(Embedding, self).__init__() + self._num_embeddings = num_embeddings + self._embedding_dim = embedding_dim + self._sparse = sparse + self._is_distributed = False + self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( + num_embeddings + padding_idx) + self._dtype = self._helper.get_default_dtype() + self._size = [self._num_embeddings, self._embedding_dim] + + self._weight_attr = weight_attr + self._remote_prefetch = False + self._name = name + self._weight = self.create_parameter( + attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + + def forward(self, x): + return F.embedding( + x, + weight=self._weight, + padding_idx=self._padding_idx, + sparse=self._sparse, + name=self._name) diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 4e342c00528a2c0115940bb7f695e1ed5b582382..a610693a0a46b7e21d2c6d83716a7bc029677583 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -113,7 +113,7 @@ class _ConvNd(layers.Layer): attr=self._bias_attr, shape=[self._out_channels], is_bias=True) -class Conv1d(layers.Layer): +class Conv1d(_ConvNd): """ This interface is used to construct a callable object of the ``Conv1d`` class. For more details, refer to code examples. @@ -172,8 +172,7 @@ class Conv1d(layers.Layer): When in 'replicate' mode, uses input boundaries to pad the input tensor. When in 'circular' mode, uses circular input to pad the input tensor. Default is 'zeros'. - bias(bool, optional): Whether to use bias. Default: True. - param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) of conv1d. If it is set to None or one attribute of ParamAttr, conv1d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, @@ -227,205 +226,15 @@ class Conv1d(layers.Layer): dilation=1, groups=1, padding_mode='zeros', - bias=True, weight_attr=None, bias_attr=None, - data_format="NCL", - name=None): - super(Conv1d, self).__init__() - assert weight_attr is not False, "param_attr should not be False here." - self._in_channels = in_channels - self._out_channels = out_channels - self._groups = groups - if in_channels % groups != 0: - raise ValueError("in_channels must be divisible by groups.") - self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') - self._stride = utils.convert_to_list(stride, 1, 'stride') - self._dilation = utils.convert_to_list(dilation, 1, 'dilation') - self._padding = padding # leave it to F.conv1d - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self._data_format = data_format - self._name = name - - self._padding_mode = padding_mode - - valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'} - if padding_mode not in valid_padding_modes: - raise ValueError( - "padding_mode must be one of {}, but got padding_mode='{}'". 
- format(valid_padding_modes, padding_mode)) - - if padding_mode in {'reflect', 'replicate', 'circular' - } and not isinstance(padding, np.int): - raise ValueError( - "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" - ) - if not isinstance(padding, str): - self._padding = utils.convert_to_list(padding, 1, 'padding') * 2 - - num_filter_channels = in_channels // groups - filter_shape = [self._out_channels, num_filter_channels - ] + self._kernel_size - - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=filter_shape, - default_initializer=_get_default_param_initializer( - self._in_channels, filter_shape)) - self.bias = self.create_parameter( - attr=self._bias_attr, shape=[self._out_channels], - is_bias=True) if bias else None - - def forward(self, x): - padding = 0 - if self._padding_mode != "zeros": - x = F.pad(x, - self._padding, - mode=self._padding_mode, - data_format=self._data_format) - else: - padding = self._padding - - out = F.conv1d( - x, - self.weight, - bias=self.bias, - padding=padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format, - name=self._name) - return out - - -class Conv2d(_ConvNd): - """ - This interface is used to construct a callable object of the ``Conv2d`` class. - For more details, refer to code examples. - The convolution2D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input and - Output are in NCHW format, where N is batch size, C is the number of - the feature map, H is the height of the feature map, and W is the width of the feature map. - Filter's shape is [MCHW] , where M is the number of output feature map, - C is the number of input feature map, H is the height of the filter, - and W is the width of the filter. If the groups is greater than 1, - C will equal the number of input feature map divided by the groups. - Please refer to UFLDL's `convolution - `_ - for more details. - If bias attribution and activation type are provided, bias is added to the - output of the convolution, and the corresponding activation function is - applied to the final result. - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - Where: - - * :math:`X`: Input value, a ``Tensor`` with NCHW format. - * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Parameters: - in_channels(int): The number of input channels in the input image. - out_channels(int): The number of output channels produced by the convolution. - kernel_size(int|list|tuple, optional): The size of the convolving kernel. - stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must - contain three integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. The default value is 1. - padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. - 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` - 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. 
It has the form [pad_d1, pad_d2, ...]. - 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. - 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). - The default value is 0. - dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must - contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the - dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. The default value is 1. - padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. - weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights - of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If it is set to None, the parameter - is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is - :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. - bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. The default value is None. - data_format(str, optional): Data format that specifies the layout of input. - It can be "NCHW" or "NHWC". Default: "NCHW". - - Attribute: - - **weight** (Parameter): the learnable weights of filter of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. - - Shape: - - - x: :math:`(N, C_{in}, H_{in}, W_{in})` - - - output: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 - - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 - - Examples: - - .. 
code-block:: python - - import numpy as np - import paddle - import paddle.nn as nn - x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') - - paddle.disable_static() - x_var = paddle.to_tensor(x) - conv = nn.Conv2d(4, 6, (3, 3)) - y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) - - # (2, 6, 6, 6) - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NCHW"): - super(Conv2d, self).__init__( + data_format="NCL"): + super(Conv1d, self).__init__( in_channels, out_channels, kernel_size, False, - 2, + 1, stride=stride, padding=padding, padding_mode=padding_mode, @@ -436,25 +245,20 @@ class Conv2d(_ConvNd): data_format=data_format) def forward(self, x): - if self._padding_mode != 'zeros': + padding = 0 + if self._padding_mode != "zeros": x = F.pad(x, - self._reversed_padding_repeated_twice, + self._padding, mode=self._padding_mode, data_format=self._data_format) - return F.conv2d( - x, - self.weight, - bias=self.bias, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format) + else: + padding = self._padding - out = F.conv2d( + out = F.conv1d( x, self.weight, bias=self.bias, - padding=self._padding, + padding=padding, stride=self._stride, dilation=self._dilation, groups=self._groups, @@ -462,7 +266,7 @@ class Conv2d(_ConvNd): return out -class ConvTranspose1d(layers.Layer): +class ConvTranspose1d(_ConvNd): """ This interface is used to construct a callable object of the ``ConvTranspose1d`` class. For more details, refer to code examples. @@ -603,34 +407,24 @@ class ConvTranspose1d(layers.Layer): padding=0, output_padding=0, groups=1, - bias=True, dilation=1, weight_attr=None, bias_attr=None, data_format="NCL"): - super(ConvTranspose1d, self).__init__() - assert weight_attr is not False, "param_attr should not be False in ConvTranspose1d." 
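With ``Conv1d`` rebuilt on top of ``_ConvNd`` above, construction and the NCL shape contract are unchanged; a small sketch with illustrative shapes (the default ``padding=0`` and ``stride=1`` give ``L_out = L_in - kernel_size + 1``):

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn as nn

    paddle.disable_static()
    # N=2 samples, C=4 channels, L=8 steps (the default "NCL" layout)
    x = paddle.to_tensor(np.random.uniform(-1, 1, (2, 4, 8)).astype('float32'))
    conv = nn.Conv1d(in_channels=4, out_channels=6, kernel_size=3)
    y = conv(x)
    print(y.numpy().shape)  # (2, 6, 6): L_out = 8 - 3 + 1 = 6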
- self._param_attr = weight_attr - self._bias_attr = bias_attr - self._groups = groups - self._in_channels = in_channels - self._out_channels = out_channels - self._output_padding = output_padding - self._data_format = data_format - self._bias = bias - - self._stride = utils.convert_to_list(stride, 1, 'stride') - self._dilation = utils.convert_to_list(dilation, 1, 'dilation') - self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') - self._padding = padding - - filter_shape = [self._in_channels, out_channels // groups - ] + self._kernel_size - self.weight = self.create_parameter( - shape=filter_shape, attr=self._param_attr) - self.bias = self.create_parameter( - attr=self._bias_attr, shape=[self._out_channels], - is_bias=True) if self._bias else None + super(ConvTranspose1d, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 1, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) def forward(self, x, output_size=None): out = F.conv_transpose1d( @@ -638,7 +432,169 @@ class ConvTranspose1d(layers.Layer): self.weight, bias=self.bias, output_size=output_size, - output_padding=self._output_padding, + output_padding=self.output_padding, + padding=self._padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + return out + + +class Conv2d(_ConvNd): + """ + This interface is used to construct a callable object of the ``Conv2d`` class. + For more details, refer to code examples. + The convolution2D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of + the feature map, H is the height of the feature map, and W is the width of the feature map. + Filter's shape is [MCHW] , where M is the number of output feature map, + C is the number of input feature map, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input feature map divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more details. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a ``Tensor`` with NCHW format. + * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Parameters: + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain three integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. The default value is 1. + padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + 1. a string in ['valid', 'same']. + 2. 
an int, which means each spatial dimension (height, width) is zero-padded by size of `padding` + 3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...]. + 4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions. + 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in the batch dimension and channel dimension should be [0, 0] or (0, 0). + The default value is 0. + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, + dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. The default value is 1. + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. The default value is None. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCHW" or "NHWC". Default: "NCHW". + + Attribute: + + **weight** (Parameter): the learnable weights of filter of this layer. + + **bias** (Parameter or None): the learnable bias of this layer. + + Shape: + + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + import paddle.nn as nn + x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') + + paddle.disable_static() + x_var = paddle.to_tensor(x) + conv = nn.Conv2d(4, 6, (3, 3)) + y_var = conv(x_var) + y_np = y_var.numpy() + print(y_np.shape) + + # (2, 6, 6, 6) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCHW"): + super(Conv2d, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 2, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + def forward(self, x): + if self._padding_mode != 'zeros': + x = F.pad(x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format) + return F.conv2d( + x, + self.weight, + bias=self.bias, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + + out = F.conv2d( + x, + self.weight, + bias=self.bias, padding=self._padding, stride=self._stride, dilation=self._dilation, @@ -920,8 +876,8 @@ class Conv3d(_ConvNd): in_channels, out_channels, kernel_size, - padding=0, stride=1, + padding=0, dilation=1, groups=1, padding_mode='zeros', @@ -1128,7 +1084,7 @@ class ConvTranspose3d(_ConvNd): bias_attr=bias_attr, data_format=data_format) - def forward(self, x, output_size): + def forward(self, x, output_size=None): if output_size is None: output_padding = self.output_padding else: diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 4d25418579d74ae896f8ca590400a0a334047e93..d13bf66ba5bfe483284e78dbcd2a42f8f3397210 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -78,7 +78,7 @@ class _InstanceNormBase(layers.Layer): super(_InstanceNormBase, self).__init__() if weight_attr == False or bias_attr == False: - assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" self._epsilon = epsilon self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -176,7 +176,7 @@ class InstanceNorm1d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm1d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ @@ -253,7 +253,7 @@ class InstanceNorm2d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm2d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ def _check_input_dim(self, input): @@ -329,7 +329,7 @@ class InstanceNorm3d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm3d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ def _check_input_dim(self, input): @@ -346,8 +346,8 @@ class GroupNorm(layers.Layer): Refer to `Group Normalization `_ . Parameters: - num_channels(int): The number of channels of input. num_groups(int): The number of groups that divided from channels. + num_channels(int): The number of channels of input. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-05. 
weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable @@ -375,19 +375,19 @@ class GroupNorm(layers.Layer): np.random.seed(123) x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') x = paddle.to_tensor(x_data) - group_norm = paddle.nn.GroupNorm(num_channels=3, num_groups=6) + group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) group_norm_out = group_norm(x) - print(group_norm_out.numpy) + print(group_norm_out.numpy()) """ def __init__(self, - num_channels, num_groups, + num_channels, epsilon=1e-05, weight_attr=None, bias_attr=None, - data_layout='NCHW', + data_format='NCHW', name=None): super(GroupNorm, self).__init__() self._weight_attr = weight_attr @@ -395,18 +395,33 @@ class GroupNorm(layers.Layer): self._epsilon = epsilon self._num_channels = num_channels self._num_groups = num_groups - if data_layout != 'NCHW': + if data_format != 'NCHW': raise ValueError("unsupported data layout:" + data_layout) param_shape = [self._num_channels] - self.weight = self.create_parameter( - attr=self._weight_attr or False, - shape=param_shape, - default_initializer=Constant(1.0)) + if weight_attr == False: + self.weight = self.create_parameter( + attr=None, shape=param_shape, default_initializer=Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( - attr=self._weight_attr or False, shape=param_shape, is_bias=True) + if bias_attr == False: + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. 
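The ``GroupNorm`` constructor above takes ``num_groups`` before ``num_channels`` and accepts ``weight_attr=False`` / ``bias_attr=False`` to freeze the affine parameters. A short sketch mirroring the corrected docstring example; the random input is illustrative:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    np.random.seed(123)
    x = paddle.to_tensor(np.random.random((2, 6, 2, 2)).astype('float32'))
    # 6 channels split into 6 groups, i.e. one channel per group
    group_norm = paddle.nn.GroupNorm(num_groups=6, num_channels=6)
    out = group_norm(x)
    print(out.numpy().shape)  # (2, 6, 2, 2)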
def forward(self, input): inputs = {'X': input} @@ -500,7 +515,7 @@ class LayerNorm(layers.Layer): layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) layer_norm_out = layer_norm(x) - print(layer_norm_out.numpy) + print(layer_norm_out.numpy()) """ def __init__(self, @@ -603,8 +618,7 @@ class _BatchNormBase(layers.Layer): initializer=Constant(0.0), trainable=False, do_model_average=True), - shape=param_shape, - dtype=self._dtype) + shape=param_shape) self._mean.stop_gradient = True self._variance = self.create_parameter( @@ -613,8 +627,7 @@ class _BatchNormBase(layers.Layer): initializer=Constant(1.0), trainable=False, do_model_average=True), - shape=param_shape, - dtype=self._dtype) + shape=param_shape) self._variance.stop_gradient = True self._data_format = data_format @@ -628,8 +641,13 @@ class _BatchNormBase(layers.Layer): def _check_input_dim(self, input): raise NotImplementedError("BatchNorm Base error") + def _check_data_format(self, input): + raise NotImplementedError("BatchNorm Base data format error") + def forward(self, input): + self._check_data_format(self._data_format) + self._check_input_dim(input) if not self.training and not self._track_running_stats: @@ -730,9 +748,15 @@ class BatchNorm1d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm1d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW' or input == 'NC' or input == 'NCL': + self._data_format = 'NCHW' + else: + raise ValueError('expected NC , NCL or None for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 2 and len(input.shape) != 3: raise ValueError('expected 2D or 3D input (got {}D input)'.format( @@ -787,7 +811,7 @@ class BatchNorm2d(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + data_format(str, optional): Specify the input data format, the data format can be "NCHW". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train period, True will track global mean and variance used for inference. When inference, track_running_stats must be True. Default: True. 
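The ``_check_data_format`` hook added above restricts ``BatchNorm1d`` to the 'NC'/'NCL' layouts; a minimal sketch with illustrative data:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    np.random.seed(123)
    # N=2, C=1, L=3 -- a 3-D "NCL" input, which the 1d check accepts
    x = paddle.to_tensor(np.random.random((2, 1, 3)).astype('float32'))
    batch_norm = paddle.nn.BatchNorm1d(1)
    out = batch_norm(x)
    print(out.numpy().shape)  # (2, 1, 3)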
@@ -816,9 +840,15 @@ class BatchNorm2d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm2d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW': + self._data_format = input + else: + raise ValueError('expected NCHW for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 4: raise ValueError('expected 4D input (got {}D input)'.format( @@ -902,9 +932,15 @@ class BatchNorm3d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm3d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW' or input == 'NCDHW': + self._data_format = 'NCHW' + else: + raise ValueError('expected NCDHW or None for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 5: raise ValueError('expected 5D input (got {}D input)'.format( diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 6f6b567849732ff889db4507708758cd8eeab2a8..129dae93b38327308263550e73031b607b2eacc3 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -87,6 +87,7 @@ class AvgPool1d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn paddle.disable_static() @@ -176,6 +177,7 @@ class AvgPool2d(layers.Layer): ShapeError: If the output's shape calculated is not greater than 0. Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -267,6 +269,7 @@ class AvgPool3d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -457,6 +460,7 @@ class MaxPool2d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -547,6 +551,7 @@ class MaxPool3d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -613,8 +618,7 @@ class AdaptiveAvgPool1d(layers.Layer): Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + output_size (int): The target output size. It must be an integer. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -623,7 +627,7 @@ class AdaptiveAvgPool1d(layers.Layer): None. Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + ValueError: 'output_size' should be an integer. Shape: - x: 3-D tensor. @@ -850,7 +854,7 @@ class AdaptiveMaxPool1d(layers.Layer): lend &= ceil((i + 1) * L_{in} / L_{out}) - Output(i) &= max(Input[lstart:lend])} + Output(i) &= max(Input[lstart:lend]) Args: output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, @@ -916,8 +920,11 @@ class AdaptiveMaxPool2d(layers.Layer): """ This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool2d: + .. 
math:: + hstart &= floor(i * H_{in} / H_{out}) hend &= ceil((i + 1) * H_{in} / H_{out}) wstart &= floor(j * W_{in} / W_{out}) @@ -932,11 +939,12 @@ class AdaptiveMaxPool2d(layers.Layer): Shape: x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64. output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. - + Returns: A callable object of AdaptiveMaxPool2d. Examples: .. code-block:: python + # adaptive max pool2d # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -977,10 +985,13 @@ class AdaptiveMaxPool2d(layers.Layer): class AdaptiveMaxPool3d(layers.Layer): """ - This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions + This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool3d: + .. math:: + dstart &= floor(i * D_{in} / D_{out}) dend &= ceil((i + 1) * D_{in} / D_{out}) hstart &= floor(j * H_{in} / H_{out}) @@ -988,10 +999,9 @@ class AdaptiveMaxPool3d(layers.Layer): wstart &= floor(k * W_{in} / W_{out}) wend &= ceil((k + 1) * W_{in} / W_{out}) Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) + Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and @@ -1003,6 +1013,7 @@ class AdaptiveMaxPool3d(layers.Layer): A callable object of AdaptiveMaxPool3d. Examples: .. code-block:: python + # adaptive max pool3d # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1029,10 +1040,10 @@ class AdaptiveMaxPool3d(layers.Layer): pool = paddle.nn.AdaptiveMaxPool3d(output_size=4) out = pool(x) # out shape: [2, 3, 4, 4, 4] - pool, indices = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) - out = pool(x) + pool = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) + out, indices = pool(x) # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4] - + """ def __init__(self, output_size, return_indices=False, name=None): diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 50a8755ac9f7b0a8e35c60f02a9fb825195ab80f..63069e83952172df3136458ebfee4b446749934d 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -25,12 +25,13 @@ __all__ = [ import copy import collections +from .common import Linear, Dropout +from .norm import LayerNorm +from .. import functional as F +from ... 
import tensor from ...fluid import layers +from ...fluid.dygraph import Layer, LayerList from ...fluid.param_attr import ParamAttr -from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList -from .. import functional as F -from ...fluid.layers import utils -from ...fluid.layers.utils import map_structure def _convert_param_attr_to_list(param_attr, n): @@ -103,7 +104,7 @@ class MultiHeadAttention(Layer): # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.MultiHeadAttention(128, 2) - output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) @@ -176,8 +177,8 @@ class MultiHeadAttention(Layer): and their data types are same as inputs. """ q = self.q_proj(query) - q = layers.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) - q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached @@ -187,8 +188,8 @@ class MultiHeadAttention(Layer): if isinstance(cache, self.Cache): # for decoder self-attention in inference - k = layers.concat([cache.k, k], axis=2) - v = layers.concat([cache.v, v], axis=2) + k = tensor.concat([cache.k, k], axis=2) + v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) @@ -219,10 +220,10 @@ class MultiHeadAttention(Layer): """ k = self.k_proj(key) v = self.v_proj(value) - k = layers.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): @@ -352,24 +353,25 @@ class MultiHeadAttention(Layer): q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention + # TODO(guosheng): use tensor.matmul, however it doesn't support `alpha` product = layers.matmul( x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if attn_mask is not None: # TODO(guosheng): support bool mask product = product + attn_mask - weights = layers.softmax(product) + weights = F.softmax(product) if self.dropout: - weights = layers.dropout( + weights = F.dropout( weights, - dropout_prob=self.dropout, - dropout_implementation="upscale_in_train", - is_test=False) + self.dropout, + training=self.training, + mode="upscale_in_train") - out = layers.matmul(weights, v) + out = tensor.matmul(weights, v) # combine heads - out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) @@ -429,7 +431,7 @@ class TransformerEncoderLayer(Layer): .. 
code-block:: python import paddle - from paddle import TransformerEncoderLayer + from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) @@ -470,17 +472,14 @@ class TransformerEncoderLayer(Layer): bias_attr=bias_attrs[0]) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) - self.dropout = Dropout( - act_dropout, dropout_implementation="upscale_in_train") + self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) - self.dropout1 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.dropout2 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.activation = getattr(layers, activation) + self.dropout1 = Dropout(dropout, mode="upscale_in_train") + self.dropout2 = Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) def forward(self, src, src_mask=None): """ @@ -539,7 +538,7 @@ class TransformerEncoder(Layer): .. code-block:: python import paddle - from paddle import TransformerEncoderLayer, TransformerEncoder + from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) @@ -643,7 +642,7 @@ class TransformerDecoderLayer(Layer): .. code-block:: python import paddle - from paddle import TransformerDecoderLayer + from paddle.nn import TransformerDecoderLayer # decoder input: [batch_size, tgt_len, d_model] dec_input = paddle.rand((2, 4, 128)) @@ -697,20 +696,16 @@ class TransformerDecoderLayer(Layer): bias_attr=bias_attrs[1]) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) - self.dropout = Dropout( - act_dropout, dropout_implementation="upscale_in_train") + self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) - self.dropout1 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.dropout2 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.dropout3 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.activation = getattr(layers, activation) + self.dropout1 = Dropout(dropout, mode="upscale_in_train") + self.dropout2 = Dropout(dropout, mode="upscale_in_train") + self.dropout3 = Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): """ @@ -834,7 +829,7 @@ class TransformerDecoder(Layer): .. code-block:: python import paddle - from paddle import TransformerDecoderLayer, TransformerDecoder + from paddle.nn import TransformerDecoderLayer, TransformerDecoder # decoder input: [batch_size, tgt_len, d_model] dec_input = paddle.rand((2, 4, 128)) @@ -1017,7 +1012,7 @@ class Transformer(Layer): .. 
code-block:: python import paddle - from paddle import Transformer + from paddle.nn import Transformer # src: [batch_size, tgt_len, d_model] enc_input = paddle.rand((2, 4, 128)) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 0da8053fe8a3495f5d3188a737638531347de648..3150b8c2d0363274dfb6fd3465110c89339cd4c9 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -45,8 +45,8 @@ class Adam(Optimizer): Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. The default value is 0.001. + learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. The default value is 0.001. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. @@ -55,7 +55,7 @@ class Adam(Optimizer): The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ @@ -143,6 +143,12 @@ class Adam(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None + if not 0 <= beta1 < 1: + raise ValueError("Invaild value of beta1, expect beta1 in [0,1).") + if not 0 <= beta2 < 1: + raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") + if not 0 <= epsilon: + raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") super(Adam, self).__init__( learning_rate=learning_rate, parameters=parameters, diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 73a78b17cbba55c1ee90a2708f6c163940158a51..cca120efd450768520d9cf027f6a36aaad121d9e 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -47,15 +47,15 @@ class Adamax(Optimizer): it is added here for numerical stability to prevent the division by 0 error. Args: - learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. The default value is 0.001. + learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. The default value is 0.001. beta1 (float, optional): The exponential decay rate for the 1st moment estimates. The default value is 0.9. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. 
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ @@ -118,6 +118,12 @@ class Adamax(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None + if not 0 <= beta1 < 1: + raise ValueError("Invaild value of beta1, expect beta1 in [0,1).") + if not 0 <= beta2 < 1: + raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") + if not 0 <= epsilon: + raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") super(Adamax, self).__init__( learning_rate=learning_rate, parameters=parameters, diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f498fcbffa24ec188b57ceb2d3c6884fc1e135d2..edaca7e8301676c8734eb3e60924844bea0121d9 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -19,112 +19,7 @@ import paddle __all__ = ['AdamW'] -class DecoupledWeightDecay(object): - def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs): - if not isinstance(coeff, float) and \ - not isinstance(coeff, framework.Variable): - raise TypeError("coeff should be float or Tensor.") - self._params_name = set() - self._apply_decay_param_fun = apply_decay_param_fun - self._coeff = coeff - super(DecoupledWeightDecay, self).__init__(**kwargs) - - def _scale_parameters(self, params_and_grads): - """ - Adds weight decay ops. - scaled_parameter = parameter * coeff - - Args: - params_and_grads: A list of (parameters, gradients) pairs, - the parameters need to decay. - Raises: - Exception: The type of coeff and parameter is not consistent. - """ - if isinstance(self._coeff, float) and self._coeff == 0.0: - return - - scaled_params = [] - for param, grad in params_and_grads: - # If no gradient then we don't need to do anything - if grad is None: - continue - if self._apply_decay_param_fun is not None \ - and not self._apply_decay_param_fun(param.name): - continue - - if isinstance(self._coeff, float): - assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \ - "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype) - else: - assert self._coeff.dtype == param.dtype, \ - "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype) - - with param.block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - assert param.name not in self._params_name - scaled_params.append((param, grad, param * self._coeff)) - self._params_name.add(param.name) - return scaled_params - - def backward(self, **kargs): - return super(DecoupledWeightDecay, self).backward(**kargs) - - def _apply_optimize(self, **kargs): - return super(DecoupledWeightDecay, self)._apply_optimize(**kargs) - - def minimize(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None): - params_grads = self.backward( - loss=loss, - startup_program=startup_program, - parameters=parameters, - no_grad_set=no_grad_set) - scaled_params = self._scale_parameters(params_grads) - for p_grad_sgrad in scaled_params: - param, grad, scaled_param = p_grad_sgrad - with param.block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - updated_param = paddle.fluid.layers.elementwise_sub( - x=param, y=scaled_param) - paddle.fluid.layers.assign(input=updated_param, output=param) - - optimize_ops = self._apply_optimize( - loss=loss, - params_grads=params_grads, - startup_program=startup_program) - return optimize_ops, params_grads - - @framework.dygraph_only - def step(self): - parameter_list 
= self._parameter_list - self._dtype = None - params_grads = [] - for param in self._parameter_list: - if not param.trainable: - continue - if param._grad_ivar() is not None: - grad_var = param._grad_ivar() - params_grads.append((param, grad_var)) - - scaled_params = self._scale_parameters(params_grads) - for p_grad_sgrad in scaled_params: - param, grad, scaled_param = p_grad_sgrad - with param.block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - updated_param = paddle.fluid.layers.elementwise_sub( - x=param, y=scaled_param) - paddle.fluid.layers.assign(input=updated_param, output=param) - optimize_ops = self._apply_optimize( - loss=None, startup_program=None, params_grads=params_grads) - - def __str__(self): - return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) - - -class AdamW(DecoupledWeightDecay, Adam): +class AdamW(Adam): """ The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. @@ -145,8 +40,8 @@ class AdamW(DecoupledWeightDecay, Adam): Args: - learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. The default value is 0.001. + learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. The default value is 0.001. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. @@ -157,9 +52,9 @@ class AdamW(DecoupledWeightDecay, Adam): It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. - weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0. The default value is 1e-08. - apply_decay_param_fun (function|None): If it is not None, + weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. + apply_decay_param_fun (function|None, optional): If it is not None, only tensors that makes apply_decay_param_fun(Tensor)==True will be updated. It only works when we want to specify tensors. Default: None. 
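The ``weight_decay`` and ``apply_decay_param_fun`` arguments described above control the decoupled decay; a minimal dygraph sketch (the parameter-name filter below assumes Paddle's default ``*.w_0`` weight naming and is only illustrative):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.uniform(-1, 1, (10, 10)).astype('float32'))
    linear = paddle.nn.Linear(10, 10)
    loss = paddle.mean(linear(x))

    adamw = paddle.optimizer.AdamW(
        learning_rate=0.1,
        parameters=linear.parameters(),
        weight_decay=0.01,
        # decay only parameters whose names pass this (illustrative) filter
        apply_decay_param_fun=lambda name: "w_0" in name)

    loss.backward()
    adamw.step()
    adamw.clear_grad()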
@@ -208,26 +103,129 @@ class AdamW(DecoupledWeightDecay, Adam): def __init__(self, learning_rate=0.001, - parameters=None, beta1=0.9, beta2=0.999, epsilon=1e-8, - weight_decay=0.0, + parameters=None, + weight_decay=0.01, apply_decay_param_fun=None, grad_clip=None, name=None, lazy_mode=False): - args_dict = { - "learning_rate": learning_rate, - "parameters": parameters, - "beta1": beta1, - "beta2": beta2, - "epsilon": epsilon, - "grad_clip": grad_clip, - "name": name, - "lazy_mode": lazy_mode - } + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + if not 0 <= beta1 < 1: + raise ValueError("Invaild value of beta1, expect beta1 in [0,1).") + if not 0 <= beta2 < 1: + raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") + if not 0 <= epsilon: + raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") + coeff = weight_decay + if not isinstance(coeff, float) and \ + not isinstance(coeff, framework.Variable): + raise TypeError("coeff should be float or Tensor.") + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._coeff = coeff super(AdamW, self).__init__( - weight_decay, - apply_decay_param_fun=apply_decay_param_fun, - **args_dict) + learning_rate=learning_rate, + parameters=parameters, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=grad_clip, + name=name, + lazy_mode=lazy_mode) + + def _scale_parameters(self, params_and_grads): + """ + Adds weight decay ops. + scaled_parameter = parameter * coeff + + Args: + params_and_grads: A list of (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. + """ + + scaled_params = [] + for param, grad in params_and_grads: + # If no gradient then we don't need to do anything + if grad is None: + continue + if self._apply_decay_param_fun is not None \ + and not self._apply_decay_param_fun(param.name): + continue + + if isinstance(self._coeff, float): + assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \ + "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype) + else: + assert self._coeff.dtype == param.dtype, \ + "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype) + if isinstance(self._learning_rate, float): + learning_rate = self._learning_rate + else: + self._learning_rate() + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + if param.name not in self._params_name: + scaled_params.append( + (param, grad, param * self._coeff * learning_rate)) + self._params_name.add(param.name) + param = param * self._coeff + return scaled_params + + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + params_grads = self.backward( + loss=loss, + startup_program=startup_program, + parameters=parameters, + no_grad_set=no_grad_set) + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + paddle.fluid.layers.assign(input=updated_param, output=param) + + optimize_ops = self._apply_optimize( + loss=loss, + params_grads=params_grads, + startup_program=startup_program) + return optimize_ops, params_grads + + @framework.dygraph_only 
+ def step(self): + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + param.set_value(updated_param.numpy()) + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + + def __str__(self): + return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py index 4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b..61391704061bda7dfbad7252cbc04c0b7d6492a4 100644 --- a/python/paddle/optimizer/lr_scheduler.py +++ b/python/paddle/optimizer/lr_scheduler.py @@ -109,7 +109,7 @@ class _LRScheduler(object): """ self.keys = ['last_epoch', 'last_lr'] - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): """ Loads the schedulers state. """ @@ -126,8 +126,8 @@ class _LRScheduler(object): "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" ) - # alias for set_dict - set_state_dict = set_dict + # alias for set_state_dict + set_dict = set_state_dict def get_lr(self): # calculate by python float diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 3f9de0cefc05d1aaee36fa3af5cfa9ae4affcb97..1bd9a1f144ed4b5c69d76070eadc317e2063e25b 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -80,7 +80,6 @@ class Optimizer(object): .. code-block:: python #Take the subclass adam as an example - #Optimizer import paddle import numpy as np @@ -98,7 +97,7 @@ class Optimizer(object): """ - @imperative_base.no_grad() + @imperative_base.no_grad def __init__(self, learning_rate, parameters=None, @@ -170,7 +169,7 @@ class Optimizer(object): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() @@ -200,7 +199,7 @@ class Optimizer(object): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.framework.save(state_dict, "paddle_dy") @@ -215,6 +214,8 @@ class Optimizer(object): adam.set_state_dict(opti_state_dict) ''' + if isinstance(self._learning_rate, _LRScheduler): + self._learning_rate.set_dict(state_dict["LR_Scheduler"]) if isinstance(self._learning_rate, _LRScheduler): self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) @@ -270,6 +271,7 @@ class Optimizer(object): main_prog = framework.default_main_program() main_prog.lr_sheduler = self._learning_rate main_prog.lr_var = lr_var + self._learning_rate_map[framework.default_main_program( )] = lr_var @@ -300,7 +302,7 @@ class Optimizer(object): this API cannot be invoked, because it will lead to conflict. 
Args: - value (float|Tensor): the value of learning rate + value (float): the value of learning rate Returns: None @@ -358,6 +360,7 @@ class Optimizer(object): Get current step learning rate. The return value is all the same When _LRScheduler is not used, otherwise return the current step learning rate. + Returns: float: The learning rate of the current step. @@ -368,7 +371,7 @@ class Optimizer(object): import paddle # example1: _LRScheduler is not used, return value is all the same paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) lr = adam.get_lr() print(lr) # 0.001 @@ -655,7 +658,7 @@ class Optimizer(object): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) @@ -798,7 +801,7 @@ class Optimizer(object): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) @@ -812,7 +815,7 @@ class Optimizer(object): if p.trainable: p.clear_gradient() - @imperative_base.no_grad() + @imperative_base.no_grad def minimize(self, loss, startup_program=None, @@ -836,36 +839,33 @@ class Optimizer(object): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: .. 
code-block:: python - + import paddle - import paddle.fluid as fluid - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - adam_optimizer = paddle.optimizer.Adam(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=0.01) + out.backward() + adam.minimize(loss) + adam.clear_grad() + """ assert isinstance(loss, Variable), "The loss should be an Tensor." @@ -885,7 +885,7 @@ class Optimizer(object): @framework.dygraph_only def step(self): """ - Execute the optimizer once. + Execute the optimizer and update parameters once. Returns: None @@ -898,7 +898,7 @@ class Optimizer(object): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 0bc4c9bfd53dc15449f03d6de6c8942e977bf562..2609972d85ccdc2a867765431fefe21b9ba2de16 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -69,8 +69,8 @@ class RMSProp(Optimizer): Parameters: - learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. + learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. rho(float): rho is :math: `\\rho` in equation, default is 0.95. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. @@ -80,7 +80,7 @@ class RMSProp(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. - parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. 
\ @@ -147,6 +147,12 @@ class RMSProp(Optimizer): raise ValueError("epsilon is not set.") if momentum is None: raise ValueError("momentum is not set.") + if not 0.0 <= epsilon: + raise ValueError("Invalid value of epsilon, expect epsilon >= 0.") + if not 0.0 <= momentum: + raise ValueError("Invalid value of momentum, expect momentum >= 0.") + if not 0.0 <= rho: + raise ValueError("Invalid value of rho, expect rho >= 0.") super(RMSProp, self).__init__( learning_rate=learning_rate, diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index bb3a578e15724e9501d69dc209bdedc65afeb82b..133c3dfb24fed82e4d666321585932d7e58a6f29 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -85,7 +85,7 @@ class SGD(Optimizer): name=name) self.type = "sgd" - @no_grad() + @no_grad def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 9ef66712540aa54eac39b7e6160c5c91b6e3fcd5..9eece1240d7d3c0b8a863091367e993047bd4527 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -73,8 +73,8 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor. dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And - 'complex64' , 'complex128' only for ComplexTensor. Default: None, for float point number, - get type from ``get_default_type``, for other type, infers from ``data`` . + 'complex64' , 'complex128' only for ComplexTensor. Default: None, infers dtype from ``data`` + except for python float number which gets dtype from ``get_default_type`` . place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. @@ -188,13 +188,21 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor". 
format(type(data))) + if not dtype and data.dtype in [ + 'float16', 'float32', 'float64', 'complex64', 'complex128' + ]: + default_type = paddle.get_default_dtype() + if np.iscomplexobj(data): + default_type = 'complex64' if default_type in [ + 'float16', 'float32' + ] else 'complex128' + data = data.astype(default_type) + + if dtype and convert_dtype(dtype) != data.dtype: + data = data.astype(dtype) if not np.iscomplexobj(data): - if dtype: - dtype = convert_dtype(dtype) - elif data.dtype in ['float16', 'float32', 'float64']: - dtype = paddle.framework.get_default_dtype() - if dtype and dtype != data.dtype: + if dtype and convert_dtype(dtype) != data.dtype: data = data.astype(dtype) return paddle.Tensor( value=data, @@ -203,14 +211,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): zero_copy=True, stop_gradient=stop_gradient) else: - if dtype: - dtype = convert_dtype(dtype) - else: - dtype = paddle.framework.get_default_dtype() - dtype = 'complex64' if dtype in ['float16', 'float32' - ] else 'complex128' - if dtype != data.dtype: - data = data.astype(dtype) name = unique_name.generate('generated_tensor') real_tensor = paddle.Tensor( value=data.real, @@ -244,10 +244,6 @@ def full_like(x, fill_value, dtype=None, name=None): Returns: Tensor: Tensor which is created according to ``x``, ``fill_value`` and ``dtype``. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - Examples: .. code-block:: python @@ -303,11 +299,6 @@ def ones(shape, dtype=None, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python @@ -366,11 +357,10 @@ def ones_like(x, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + x = paddle.to_tensor([1,2,3]) out1 = paddle.zeros_like(x) # [1., 1., 1.] out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1] @@ -392,11 +382,6 @@ def zeros(shape, dtype=None, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python @@ -453,11 +438,10 @@ def zeros_like(x, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + x = paddle.to_tensor([1,2,3]) out1 = paddle.zeros_like(x) # [0., 0., 0.] out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0] @@ -482,10 +466,6 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): Returns: Tensor: An identity Tensor or LoDTensor of shape [num_rows, num_columns]. - - Raises: - TypeError: The ``dtype`` must be one of float16, float32, float64, int32 int64 and None. - TypeError: The ``num_columns`` must be non-negative int. Examples: .. 
code-block:: python @@ -534,11 +514,6 @@ def full(shape, fill_value, dtype=None, name=None): Returns: Tensor: Tensor which is created according to ``shape``, ``fill_value`` and ``dtype``. - Raises: - TypeError: The ``dtype`` must be one of None, bool, float16, float32, float64, int32 and int64. - TypeError: The ``shape`` must be one of Tensor, list and tuple. The data type of ``shape`` must - be int32 or int64 when the it's a Tensor - Examples: .. code-block:: python @@ -619,7 +594,6 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -633,7 +607,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): out3 = paddle.arange(4.999, dtype='float32') # [0., 1., 2., 3., 4.] - start_var = paddle.to_tensor(np.array([3])) + start_var = paddle.to_tensor([3]) out4 = paddle.arange(start_var, 7) # [3, 4, 5, 6] @@ -725,7 +699,7 @@ def tril(x, diagonal=0, name=None): paddle.disable_static() - x = paddle.to_variable(data) + x = paddle.to_tensor(data) tril1 = paddle.tensor.tril(x) # array([[ 1, 0, 0, 0], @@ -797,7 +771,7 @@ def triu(x, diagonal=0, name=None): paddle.disable_static() # example 1, default diagonal - x = paddle.to_variable(data) + x = paddle.to_tensor(data) triu1 = paddle.tensor.triu(x) # array([[ 1, 2, 3, 4], # [ 0, 6, 7, 8], diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index b5b528325cd9f52a8b61ef21df0095c41da5a8ed..7ddda5091a0a260f56b29bcedfdcb0786e82ddd6 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -810,7 +810,7 @@ def cholesky(x, upper=False, name=None): a = np.random.rand(3, 3) a_t = np.transpose(a, [1, 0]) x_data = np.matmul(a, a_t) + 1e-03 - x = paddle.to_variable(x_data) + x = paddle.to_tensor(x_data) out = paddle.cholesky(x, upper=False) print(out.numpy()) # [[1.190523 0. 0. ] @@ -855,15 +855,16 @@ def bmm(x, y, name=None): Examples: import paddle - # In imperative mode: - # size input1: (2, 2, 3) and input2: (2, 3, 2) - input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]]) - input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) - paddle.disable_static() - - x = paddle.to_variable(input1) - y = paddle.to_variable(input2) + + # In imperative mode: + # size x: (2, 2, 3) and y: (2, 3, 2) + x = paddle.to_tensor([[[1.0, 1.0, 1.0], + [2.0, 2.0, 2.0]], + [[3.0, 3.0, 3.0], + [4.0, 4.0, 4.0]]]) + y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]], + [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) out = paddle.bmm(x, y) #output size: (2, 2, 2) #output value: @@ -924,10 +925,8 @@ def histogram(input, bins=100, min=0, max=0): Code Example 2: .. code-block:: python import paddle - import numpy as np paddle.disable_static(paddle.CPUPlace()) - inputs_np = np.array([1, 2, 1]).astype(np.float) - inputs = paddle.to_variable(inputs_np) + inputs = paddle.to_tensor([1, 2, 1]) result = paddle.histogram(inputs, bins=4, min=0, max=3) print(result) # [0, 2, 1, 0] paddle.enable_static() diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 36b558d597c1ce1333a8f1eec54e2fd2813625e3..5fd714421c8ed14820738543a1824c779296d7c3 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -71,13 +71,12 @@ def equal_all(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 2, 3])) - z = paddle.to_variable(np.array([1, 4, 3])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 2, 3]) + z = paddle.to_tensor([1, 4, 3]) result1 = paddle.equal_all(x, y) print(result1.numpy()) # result1 = [True ] result2 = paddle.equal_all(x, z) @@ -120,14 +119,11 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([10000., 1e-07]).astype("float32") - np_y = np.array([10000.1, 1e-08]).astype("float32") - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([10000., 1e-07]) + y = paddle.to_tensor([10000.1, 1e-08]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") np_result1 = result1.numpy() @@ -137,10 +133,8 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): np_result2 = result2.numpy() # [False] - np_x = np.array([1.0, float('nan')]).astype("float32") - np_y = np.array([1.0, float('nan')]).astype("float32") - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([1.0, float('nan')]) + y = paddle.to_tensor([1.0, float('nan')]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") np_result1 = result1.numpy() @@ -195,12 +189,11 @@ def equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.equal(x, y) print(result1.numpy()) # result1 = [True False False] """ @@ -227,12 +220,11 @@ def greater_equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.greater_equal(x, y) print(result1.numpy()) # result1 = [True False True] """ @@ -259,12 +251,11 @@ def greater_than(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.greater_than(x, y) print(result1.numpy()) # result1 = [False False True] """ @@ -292,12 +283,11 @@ def less_equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.less_equal(x, y) print(result1.numpy()) # result1 = [True True False] """ @@ -325,12 +315,11 @@ def less_than(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.less_than(x, y) print(result1.numpy()) # result1 = [False True False] """ @@ -358,12 +347,12 @@ def not_equal(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np + import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.not_equal(x, y) print(result1.numpy()) # result1 = [False True True] """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 845d2cf4d199328bbf8d0e03cd3a7a24a61aafd2..363c3ffceb85ef6168dc8c33b81185cac08083fb 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -85,11 +85,6 @@ def concat(x, axis=0, name=None): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Raises: - TypeError: ``x`` must be list or tuple. - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32 and int64. - TypeError: The ``axis`` must be int or Tensor. The dtype of ``axis`` must be int32 or int64 when it's a Tensor. - TypeError: All the Tensors in ``x`` must have the same data type. Returns: Tensor: A Tensor with the same data type as ``x``. @@ -98,18 +93,14 @@ def concat(x, axis=0, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() # Now we are in imperative mode - in1 = np.array([[1, 2, 3], - [4, 5, 6]]) - in2 = np.array([[11, 12, 13], - [14, 15, 16]]) - in3 = np.array([[21, 22], - [23, 24]]) - x1 = paddle.to_tensor(in1) - x2 = paddle.to_tensor(in2) - x3 = paddle.to_tensor(in3) + x1 = paddle.to_tensor([[1, 2, 3], + [4, 5, 6]]) + x2 = paddle.to_tensor([[11, 12, 13], + [14, 15, 16]]) + x3 = paddle.to_tensor([[21, 22], + [23, 24]]) zero = paddle.full(shape=[1], dtype='int32', fill_value=0) # When the axis is negative, the real axis is (axis + Rank(x)) # As follow, axis is -1, Rank(x) is 2, the real axis is 1 @@ -158,7 +149,7 @@ def flip(x, axis, name=None): image_shape=(3, 2, 2) x = np.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape) x = x.astype('float32') - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flip(img, [0,1]) print(out) # [[[10,11][8, 9]],[[6, 7],[4, 5]] [[2, 3],[0, 1]]] @@ -250,7 +241,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100. x = x.astype('float32') - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flatten(img, start_axis=1, stop_axis=2) # out shape is [2, 12, 4] """ @@ -315,15 +306,13 @@ def roll(x, shifts, axis=None, name=None): Examples: .. code-block:: python - import numpy as np import paddle import paddle.fluid as fluid - data = np.array([[1.0, 2.0, 3.0], - [4.0, 5.0, 6.0], - [7.0, 8.0, 9.0]]) paddle.disable_static() - x = paddle.to_variable(data) + x = paddle.to_tensor([[1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + [7.0, 8.0, 9.0]]) out_z1 = paddle.roll(x, shifts=1) print(out_z1.numpy()) #[[9. 1. 2.] @@ -433,8 +422,7 @@ def stack(x, axis=0, name=None): [5.0, 6.0] ] ] Args: - x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors. - If ``x`` is a ``list``, the Tensors in ``x`` + x (list[Tensor]|tuple[Tensor]): Input ``x`` can be a ``list`` or ``tuple`` of tensors, the Tensors in ``x`` must be of the same shape and dtype. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. 
``axis`` range is ``[-(R+1), R+1)``, where ``R`` is the number of dimensions of the first input tensor ``x[0]``. @@ -448,17 +436,11 @@ def stack(x, axis=0, name=None): .. code-block:: python import paddle - import numpy as np - - data1 = np.array([[1.0, 2.0]]) - data2 = np.array([[3.0, 4.0]]) - data3 = np.array([[5.0, 6.0]]) - + paddle.disable_static() - x1 = paddle.to_variable(data1) - x2 = paddle.to_variable(data2) - x3 = paddle.to_variable(data3) - + x1 = paddle.to_tensor([[1.0, 2.0]]) + x2 = paddle.to_tensor([[3.0, 4.0]]) + x3 = paddle.to_tensor([[5.0, 6.0]]) out = paddle.stack([x1, x2, x3], axis=0) print(out.shape) # [3, 1, 2] print(out.numpy()) @@ -487,10 +469,7 @@ def split(x, num_or_sections, axis=0, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``num_or_sections`` is not int, list or tuple. - TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor. + Example: .. code-block:: python @@ -638,12 +617,10 @@ def unique(x, Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([2, 3, 3, 1, 5, 3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) unique = paddle.unique(x) np_unique = unique.numpy() # [1 2 3 5] _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) @@ -651,8 +628,7 @@ def unique(x, np_inverse = inverse.numpy() # [1 2 2 0 3 2] np_counts = counts.numpy() # [1 1 3 1] - x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) unique = paddle.unique(x) np_unique = unique.numpy() # [0 1 2 3] @@ -812,23 +788,15 @@ def gather(x, index, axis=None, name=None): Returns: output (Tensor): The output is a tensor with the same rank as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - TypeError: ``axis`` must be a Tensor or int and the data type of ``index`` must be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - input_1 = np.array([[1,2],[3,4],[5,6]]) - index_1 = np.array([0,1]) - input = paddle.to_tensor(input_1) - index = paddle.to_tensor(index_1) + input = paddle.to_tensor([[1,2],[3,4],[5,6]]) + index = paddle.to_tensor([0,1]) output = paddle.gather(input, index, axis=0) # expected output: [[1,2],[3,4]] """ @@ -964,16 +932,11 @@ def scatter(x, index, updates, overwrite=True, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32) - index_data = np.array([2, 1, 0, 1]).astype(np.int64) - updates_data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float32) - - x = paddle.to_tensor(x_data) - index = paddle.to_tensor(index_data) - updates = paddle.to_tensor(updates_data) + x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') output1 = paddle.scatter(x, index, updates, overwrite=False) # [[3., 3.], @@ -1026,10 +989,7 @@ def chunk(x, chunks, axis=0, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``chunks`` is not int. - TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor. + Example: .. code-block:: python @@ -1080,11 +1040,9 @@ def tile(x, repeat_times, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_data = np.array([1, 2, 3]).astype('int32') - data = paddle.to_tensor(np_data) + data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.tile(data, repeat_times=[2, 1]) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1093,8 +1051,7 @@ def tile(x, repeat_times, name=None): np_out = out.numpy() # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]] - np_repeat_times = np.array([2, 1]).astype("int32") - repeat_times = paddle.to_tensor(np_repeat_times) + repeat_times = paddle.to_tensor([2, 1], dtype='int32') out = paddle.tile(data, repeat_times=repeat_times) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1162,15 +1119,12 @@ def expand_as(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - np_data_x = np.array([1, 2, 3]).astype('int32') - np_data_y = np.array([[1, 2, 3], [4, 5, 6]]).astype('int32') - data_x = paddle.to_tensor(np_data_x) - data_y = paddle.to_tensor(np_data_y) + data_x = paddle.to_tensor([1, 2, 3], 'int32') + data_y = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], 'int32') out = paddle.expand_as(data_x, data_y) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1218,12 +1172,10 @@ def expand(x, shape, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - np_data = np.array([1, 2, 3]).astype('int32') - data = paddle.to_tensor(np_data) + data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.expand(data, shape=[2, 3]) out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1322,11 +1274,6 @@ def reshape(x, shape, name=None): Returns: Tensor: A reshaped Tensor with the same data type as ``x``. - Raises: - ValueError: If more than one elements of ``shape`` is -1. - ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. - ValueError: If the elements in ``shape`` is negative except -1. - Examples: .. code-block:: python @@ -1413,23 +1360,16 @@ def gather_nd(x, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. 
- TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([[[1, 2], [3, 4], [5, 6]], - [[7, 8], [9, 10], [11, 12]]]) - np_index = [[0, 1]] - x = paddle.to_tensor(np_x) - index = paddle.to_tensor(np_index) + x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], + [[7, 8], [9, 10], [11, 12]]]) + index = paddle.to_tensor([[0, 1]]) output = paddle.gather_nd(x, index) #[[3, 4]] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index d2db2a7cb71945e137e46d6793f8cba1f7adf12f..ed2bbe03a366054dfe7d798310c7fa5d419b44a8 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -174,14 +174,12 @@ def pow(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: y is a float - x_data = np.array([1, 2, 3]) + x = paddle.to_tensor([1, 2, 3]) y = 2 - x = paddle.to_tensor(x_data) res = paddle.pow(x, y) print(res.numpy()) # [1 4 9] @@ -291,13 +289,10 @@ Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = paddle.to_variable(np_x) - y = paddle.to_variable(np_y) + x = paddle.to_tensor([2, 3, 4], 'float64') + y = paddle.to_tensor([1, 5, 2], 'float64') z = paddle.add(x, y) np_z = z.numpy() print(np_z) # [3., 8., 6. ] @@ -335,14 +330,11 @@ def divide(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 4], dtype='float64') + y = paddle.to_tensor([1, 5, 2], dtype='float64') z = paddle.divide(x, y) print(z.numpy()) # [2., 0.6, 2.] @@ -440,14 +432,11 @@ def floor_divide(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 8, 7]) - np_y = np.array([1, 5, 3, 3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 8, 7]) + y = paddle.to_tensor([1, 5, 3, 3]) z = paddle.floor_divide(x, y) print(z.numpy()) # [2, 0, 2, 2] @@ -530,14 +519,11 @@ def remainder(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 8, 7]) - np_y = np.array([1, 5, 3, 3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 8, 7]) + y = paddle.to_tensor([1, 5, 3, 3]) z = paddle.remainder(x, y) print(z.numpy()) # [0, 3, 2, 1] @@ -612,20 +598,15 @@ def multiply(x, y, axis=-1, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.to_tensor([[5, 6], [7, 8]]) res = paddle.multiply(x, y) print(res.numpy()) # [[5, 12], [21, 32]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) + y = paddle.to_tensor([1, 2]) res = paddle.multiply(x, y, axis=1) print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] @@ -654,36 +635,28 @@ Examples: paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.to_tensor([[5, 6], [7, 8]]) res = paddle.maximum(x, y) print(res.numpy()) #[[5. 6.] # [7. 8.]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) + y = paddle.to_tensor([1, 2]) res = paddle.maximum(x, y, axis=1) print(res.numpy()) #[[[1. 2. 3.] # [2. 2. 3.]]] - x_data = np.array([2, 3, 5], dtype=np.float32) - y_data = np.array([1, 4, np.nan], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([2, 3, 5], dtype='float32') + y = paddle.to_tensor([1, 4, np.nan], dtype='float32') res = paddle.maximum(x, y) print(res.numpy()) #[ 2. 4. nan] - x_data = np.array([5, 3, np.inf], dtype=np.float32) - y_data = np.array([1, 4, 5], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([5, 3, np.inf], dtype='float32') + y = paddle.to_tensor([1, 4, 5], dtype='float32') res = paddle.maximum(x, y) print(res.numpy()) #[ 5. 4. inf] @@ -703,38 +676,31 @@ Examples: import paddle import numpy as np + paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32') + y = paddle.to_tensor([[5, 6], [7, 8]], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[[1. 2.] # [3. 4.]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]], dtype='float32') + y = paddle.to_tensor([1, 2], dtype='float32') res = paddle.minimum(x, y, axis=1) print(res.numpy()) #[[[1. 1. 1.] # [2. 2. 2.]]] - x_data = np.array([2, 3, 5], dtype=np.float32) - y_data = np.array([1, 4, np.nan], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([2, 3, 5], dtype='float32') + y = paddle.to_tensor([1, 4, np.nan], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[ 1. 3. 
nan] - x_data = np.array([5, 3, np.inf], dtype=np.float32) - y_data = np.array([1, 4, 5], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([5, 3, np.inf], dtype='float32') + y = paddle.to_tensor([1, 4, 5], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[1. 3. 5.] @@ -794,33 +760,33 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): it's data type is the same as `x`. Raises: - ValueError: The :attr:`dtype` must be float64 or int64. + ValueError: If the data type of `x` is float64, :attr:`dtype` can not be float32 or int32. + ValueError: If the data type of `x` is int64, :attr:`dtype` can not be int32. TypeError: The type of :attr:`axis` must be int, list or tuple. Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - # x is a Tensor variable with following elements: + # x is a Tensor with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the corresponding output tensor. - x_data = np.array([[0.2, 0.3, 0.5, 0.9],[0.1, 0.2, 0.6, 0.7]]).astype('float32') - x = paddle.to_variable(x_data) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) out1 = paddle.sum(x) # [3.5] out2 = paddle.sum(x, axis=0) # [0.3, 0.5, 1.1, 1.6] out3 = paddle.sum(x, axis=-1) # [1.9, 1.6] out4 = paddle.sum(x, axis=1, keepdim=True) # [[1.9], [1.6]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # y is a Tensor with shape [2, 2, 2] and elements as below: # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] # Each example is followed by the corresponding output tensor. - y_data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype('float32') - y = paddle.to_variable(y_data) + y = paddle.to_tensor([[[1, 2], [3, 4]], + [[5, 6], [7, 8]]]) out5 = paddle.sum(y, axis=[1, 2]) # [10, 26] out6 = paddle.sum(y, axis=[0, 1]) # [16, 20] """ @@ -850,10 +816,6 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'out_dtype': convert_np_dtype_to_dtype_(dtype) }) dtype_flag = True - else: - raise ValueError( - "The value of 'dtype' in sum op must be float64, int64, but received of {}". - format(dtype)) if in_dygraph_mode(): axis = axis if axis != None and axis != [] else [0] @@ -867,6 +829,17 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'reduce_all', reduce_all_flag) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum') + + if dtype is not None: + check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'sum') + x_dtype = convert_dtype(x.dtype) + + if (x_dtype == "float64" and dtype in ["float32", "int32"]) or \ + (x_dtype == "int64" and dtype == "int32"): + raise ValueError("The input(x)'s dtype is {} but the attr(dtype) of sum is {}, " + "which may cause data type overflows. Please reset attr(dtype) of sum." + .format(x_dtype, dtype)) + check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum') helper = LayerHelper('sum', **locals()) @@ -1121,9 +1094,9 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): paddle.disable_static() - x = paddle.to_variable(data_x) - y = paddle.to_variable(data_y) - input = paddle.to_variable(data_input) + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) @@ -1204,12 +1177,10 @@ def logsumexp(x, axis=None, keepdim=False, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x = np.array([[-1.5, 0., 2.], [3., 1.2, -2.4]]) - x = paddle.to_tensor(x) + x = paddle.to_tensor([[-1.5, 0., 2.], [3., 1.2, -2.4]]) out1 = paddle.logsumexp(x) # [3.4691226] out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602] @@ -1260,12 +1231,10 @@ def inverse(x, name=None): Examples: .. code-block:: python - import numpy as np import paddle - - mat_np = np.array([[2, 0], [0, 2]]).astype("float32") paddle.disable_static() - mat = paddle.to_variable(mat_np) + + mat = paddle.to_tensor([[2, 0], [0, 2]], dtype='float32') inv = paddle.inverse(mat) print(inv) # [[0.5, 0], [0, 0.5]] @@ -1316,16 +1285,15 @@ def max(x, axis=None, keepdim=False, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() # data_x is a variable with shape [2, 4] # the axis is a int element - data_x = np.array([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]) - x = paddle.to_variable(data_x) + + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) result1 = paddle.max(x) print(result1.numpy()) #[0.9] @@ -1342,9 +1310,9 @@ def max(x, axis=None, keepdim=False, name=None): # data_y is a variable with shape [2, 2, 2] # the axis is list - data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) - y = paddle.to_variable(data_y) + + y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) result5 = paddle.max(y, axis=[1, 2]) print(result5.numpy()) #[4. 8.] @@ -1411,16 +1379,14 @@ def min(x, axis=None, keepdim=False, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - # data_x is a variable with shape [2, 4] + # x is a tensor with shape [2, 4] # the axis is a int element - data_x = np.array([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]) - x = paddle.to_variable(data_x) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) result1 = paddle.min(x) print(result1.numpy()) #[0.1] @@ -1435,11 +1401,10 @@ def min(x, axis=None, keepdim=False, name=None): #[[0.2] # [0.1]] - # data_y is a variable with shape [2, 2, 2] + # y is a variable with shape [2, 2, 2] # the axis is list - data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) - y = paddle.to_variable(data_y) + y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) result5 = paddle.min(y, axis=[1, 2]) print(result5.numpy()) #[1. 5.] @@ -1596,11 +1561,9 @@ def clip(x, min=None, max=None, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x = np.array([[1.2,3.5], [4.5,6.4]]).astype('float32') - x1 = paddle.to_variable(x) + x1 = paddle.to_tensor([[1.2, 3.5], [4.5, 6.4]], 'float32') out1 = paddle.clip(x1, min=3.5, max=5.0) out2 = paddle.clip(x1, min=2.5) print(out1.numpy()) @@ -1611,11 +1574,8 @@ def clip(x, min=None, max=None, name=None): # [[4.5, 6.4] """ - np_dtype = np.float32 - if x.dtype == VarDesc.VarType.FP64: - np_dtype = np.float64 - fmin = float(np.finfo(np_dtype).min) - fmax = float(np.finfo(np_dtype).max) + fmin = float(np.finfo(np.float32).min) + fmax = float(np.finfo(np.float32).max) if in_dygraph_mode(): if isinstance(min, Variable): @@ -1656,7 +1616,7 @@ def clip(x, min=None, max=None, name=None): helper = LayerHelper('clip', **locals()) output = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype('x')) helper.append_op( type='clip', inputs=inputs, outputs={'Out': [output]}, attrs=attrs) @@ -1704,9 +1664,9 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): paddle.disable_static() - case1 = paddle.to_variable(case1) - case2 = paddle.to_variable(case2) - case3 = paddle.to_variable(case3) + case1 = paddle.to_tensor(case1) + case2 = paddle.to_tensor(case2) + case3 = paddle.to_tensor(case3) data1 = paddle.trace(case1) # data1.shape = [1] data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3] data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5] @@ -1897,10 +1857,8 @@ def isfinite(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isfinite(x) print(out.numpy()) # [False True True False True False False] """ @@ -1928,10 +1886,8 @@ def isinf(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isinf(x) print(out.numpy()) # [ True False False True False False False] """ @@ -1959,10 +1915,8 @@ def isnan(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isnan(x) print(out.numpy()) # [False False False False False True True] """ @@ -2005,14 +1959,12 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() # the axis is a int element - data_x = np.array([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]).astype(np.float32) - x = paddle.to_tensor(data_x) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) out1 = paddle.prod(x) print(out1.numpy()) # [0.0002268] @@ -2038,9 +1990,8 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): # int64 # the axis is list - data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) - y = paddle.to_tensor(data_y) + y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) out6 = paddle.prod(y, [0, 1]) print(out6.numpy()) # [105. 384.] @@ -2073,12 +2024,10 @@ def sign(x, name=None): Examples: .. code-block:: python - import numpy as np import paddle - data = np.array([3.0, 0.0, -2.0, 1.7], dtype='float32') paddle.disable_static() - x = paddle.to_tensor(data) + x = paddle.to_tensor([3.0, 0.0, -2.0, 1.7], dtype='float32') out = paddle.sign(x=x) print(out) # [1.0, 0.0, -1.0, 1.0] """ @@ -2113,12 +2062,9 @@ def tanh(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.tanh(x) print(out.numpy()) # [-0.37994896 -0.19737532 0.09966799 0.29131261] diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 6b08599fad1dfc6b5d60c3798bba802a5ddefd02..b38a1d0f5b7e92b0eac907170aad76a2b5c69bc1 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -14,17 +14,12 @@ # TODO: define random functions -import numpy as np - from ..fluid import core -from ..fluid.framework import device_guard, in_dygraph_mode, _varbase_creator, Variable, convert_np_dtype_to_dtype_ -from ..fluid.layers.layer_function_generator import templatedoc +from ..fluid.framework import in_dygraph_mode, Variable, convert_np_dtype_to_dtype_ from ..fluid.layer_helper import LayerHelper -from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, check_shape from ..fluid.layers import utils -from ..fluid.layers.tensor import fill_constant import paddle -import warnings from ..fluid.io import shuffle #DEFINE_ALIAS @@ -65,7 +60,6 @@ def bernoulli(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -94,26 +88,26 @@ def bernoulli(x, name=None): return out -def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): +def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): """ This OP returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - mean(float|int, optional): Mean of the output tensor, default is 0.0. - std(float|int, optional): Standard deviation of the output tensor, default + mean (float|int, optional): Mean of the output tensor, default is 0.0. + std (float|int, optional): Standard deviation of the output tensor, default is 1.0. 
- seed(int, optional): ${seed_comment} - dtype(str|np.dtype, optional): The data type of the output Tensor. + seed (int, optional): Random seed of generator. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -121,26 +115,26 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): Tensor: A Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. """ + op_type_for_check = 'gaussian/standard_normal/randn/normal' + seed = 0 + if dtype is None: dtype = paddle.framework.get_default_dtype() if dtype not in ['float32', 'float64']: raise TypeError( - "gaussian_random only supports [float32, float64], but the default dtype is %s" - % dtype) - + "{} only supports [float32, float64], but the default dtype is {}" + .format(op_type_for_check, dtype)) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - seed = 0 - op_type_for_check = 'gaussian_random/standard_normal/randn/normal' if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', float(std), 'seed', seed, 'dtype', dtype) - check_type(shape, 'shape', (list, tuple, Variable), op_type_for_check) + check_shape(shape, op_type_for_check) check_dtype(dtype, 'dtype', ['float32', 'float64'], op_type_for_check) inputs = {} @@ -151,10 +145,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): 'dtype': dtype, 'use_mkldnn': False } - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check) - helper = LayerHelper('gaussian_random', **locals()) + helper = LayerHelper('gaussian', **locals()) out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='gaussian_random', @@ -172,12 +166,12 @@ def standard_normal(shape, dtype=None, name=None): and ``dtype``. Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of the output Tensor. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). @@ -189,27 +183,22 @@ def standard_normal(shape, dtype=None, name=None): normal distribution with mean 0 and standard deviation 1, with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: attr shape is a list which doesn't contain Tensor. 
- result_1 = paddle.standard_normal(shape=[2, 3]) + out1 = paddle.standard_normal(shape=[2, 3]) # [[-2.923464 , 0.11934398, -0.51249987], # random # [ 0.39632758, 0.08177969, 0.2692008 ]] # random # example 2: attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.standard_normal(shape=[dim_1, dim_2, 2]) + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.standard_normal(shape=[dim1, dim2, 2]) # [[[-2.8852394 , -0.25898588], # random # [-0.47420555, 0.17683524], # random # [-0.7989969 , 0.00754541]], # random @@ -218,21 +207,14 @@ def standard_normal(shape, dtype=None, name=None): # [ 0.8086993 , 0.6868893 ]]] # random # example 3: attr shape is a Tensor, the data type must be int64 or int32. - var_shape = paddle.to_tensor(np.array([2, 3])) - result_3 = paddle.standard_normal(var_shape) + shape_tensor = paddle.to_tensor([2, 3]) + result_3 = paddle.standard_normal(shape_tensor) + # [[-2.878077 , 0.17099959, 0.05111201] # random # [-0.3761474, -1.044801 , 1.1870178 ]] # random """ - if dtype is None: - dtype = paddle.framework.get_default_dtype() - if dtype not in ['float32', 'float64']: - raise TypeError( - "standard_normal only supports [float32, float64], but the default dtype is %s" - % dtype) - - return gaussian_random( - shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) + return gaussian(shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) randn = standard_normal @@ -275,7 +257,6 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -283,11 +264,11 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): # [[ 0.17501129 0.32364586 1.561118 ] # random # [-1.7232178 1.1545963 -0.76156676]] # random - mean_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) out2 = paddle.normal(mean=mean_tensor) # [ 0.18644847 -1.19434458 3.93694787] # random - std_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + std_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) out3 = paddle.normal(mean=mean_tensor, std=std_tensor) # [1.00780561 3.78457445 5.81058198] # random @@ -306,16 +287,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): "If std is Tensor, it's data type only support float32, float64." ) if shape is not None: - if isinstance(shape, (list, tuple)): - for item in shape: - check_type(item, 'shape', (int), 'normal', - 'Elements of shape should be int.') - elif isinstance(shape, Variable): - check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'normal') - else: - assert TypeError( - 'If mean and std are all not Tensor, shape should be list, tuple, Tensor.' - ) + check_shape(shape, 'normal') if isinstance(mean, Variable): if isinstance(std, Variable): @@ -330,7 +302,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): mean = float(mean) out = standard_normal(paddle.shape(std), std.dtype, name) else: - return gaussian_random(shape=shape, mean=mean, std=std, name=name) + return gaussian(shape=shape, mean=mean, std=std, name=name) out = out * std + mean if not in_dygraph_mode(): @@ -383,7 +355,6 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() @@ -405,8 +376,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): # example 3: # attr shape is a Tensor, the data type must be int64 or int32. - shape = np.array([2, 3]) - shape_tensor = paddle.to_tensor(shape) + shape_tensor = paddle.to_tensor([2, 3]) result_3 = paddle.tensor.random.uniform(shape_tensor) # if shape_tensor's value is [2, 3] # result_3 is: @@ -419,27 +389,27 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): dtype = paddle.framework.get_default_dtype() if dtype not in ['float32', 'float64']: raise TypeError( - "uniform only supports [float32, float64], but the default dtype is %s" - % dtype) + "uniform/rand only supports [float32, float64], but the default dtype is {}". + format(dtype)) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.uniform_random('shape', shape, 'min', float(min), 'max', float(max), 'seed', seed, 'dtype', dtype) - check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand') - check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand') + check_type(shape, 'shape', (list, tuple, Variable), 'uniform/rand') + check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform/rand') inputs = dict() attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils._get_shape_tensor_inputs( - inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand') + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='uniform/rand') - helper = LayerHelper("uniform_random", **locals()) + helper = LayerHelper("uniform", **locals()) out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="uniform_random", inputs=inputs, attrs=attrs, @@ -449,29 +419,26 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): def randint(low=0, high=None, shape=[1], dtype=None, name=None): """ - :alias_main: paddle.randint - :alias: paddle.tensor.randint, paddle.tensor.random.randint - This OP returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. If ``high`` is None (the default), the range is [0, ``low``). Args: - low(int): The lower bound on the range of random values to generate. + low (int): The lower bound on the range of random values to generate. The ``low`` is included in the range. If ``high`` is None, the range is [0, ``low``). Default is 0. - high(int, optional): The upper bound on the range of random values to + high (int, optional): The upper bound on the range of random values to generate, the ``high`` is excluded in the range. Default is None (see above for behavior if high = None). Default is None. - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). Default is [1]. - dtype(str|np.dtype, optional): The data type of the + dtype (str|np.dtype, optional): The data type of the output tensor. Supported data types: int32, int64. If ``dytpe`` is None, the data type is int64. Default is None. 
- name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -479,48 +446,43 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): Tensor: A Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not int32, int64. - ValueError: If ``high`` is not greater then ``low``; If ``high`` is - None, and ``low`` is not greater than 0. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: # attr shape is a list which doesn't contain Tensor. - result_1 = paddle.randint(low=-5, high=5, shape=[3]) + out1 = paddle.randint(low=-5, high=5, shape=[3]) # [0, -3, 2] # random # example 2: # attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32") + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2], dtype="int32") # [[0, -1, -3], # random # [4, -2, 0]] # random # example 3: # attr shape is a Tensor - var_shape = paddle.to_variable(np.array([3])) - result_3 = paddle.randint(low=-5, high=5, shape=var_shape) + + shape_tensor = paddle.to_tensor(3) + result_3 = paddle.randint(low=-5, high=5, shape=shape_tensor) + # [-2, 2, 3] # random # example 4: # data type is int32 - result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32') + out4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32') # [-5, 4, -4] # random # example 5: # Input only one parameter # low=0, high=10, shape=[1], dtype='int64' - result_5 = paddle.randint(10) + out5 = paddle.randint(10) # [7] # random """ @@ -537,11 +499,11 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.randint('shape', shape, 'low', low, 'high', high, 'seed', 0, 'dtype', dtype) - check_type(shape, 'shape', (list, tuple, Variable), 'randint') + check_shape(shape, 'randint') check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint') if low >= high: raise ValueError( @@ -550,7 +512,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): inputs = dict() attrs = {'low': low, 'high': high, 'seed': 0, 'dtype': dtype} - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='randint') helper = LayerHelper("randint", **locals()) @@ -560,21 +522,17 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): return out -@templatedoc() def randperm(n, dtype="int64", name=None): """ - :alias_main: paddle.randperm - :alias: paddle.tensor.randperm, paddle.tensor.random.randperm - This OP returns a 1-D Tensor filled with random permutation values from 0 to n-1, with ``dtype``. Args: - n(int): The upper bound (exclusive), and it should be greater than 0. - dtype(str|np.dtype, optional): The data type of + n (int): The upper bound (exclusive), and it should be greater than 0. + dtype (str|np.dtype, optional): The data type of the output Tensor. 
Supported data types: int32, int64, float32, float64. Default is int64. - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -582,10 +540,6 @@ def randperm(n, dtype="int64", name=None): Tensor: A 1-D Tensor filled with random permutation values from 0 to n-1, with ``dtype``. - Raises: - ValueError: If ``n`` is not greater than 0. - TypeError: If ``dtype`` is not int32, int64, float32, float64. - Examples: .. code-block:: python @@ -593,10 +547,10 @@ def randperm(n, dtype="int64", name=None): paddle.disable_static() - result_1 = paddle.randperm(5) + out1 = paddle.randperm(5) # [4, 1, 2, 3, 0] # random - result_2 = paddle.randperm(7, 'int32') + out2 = paddle.randperm(7, 'int32') # [1, 6, 2, 0, 4, 3, 5] # random """ @@ -622,32 +576,20 @@ def randperm(n, dtype="int64", name=None): def rand(shape, dtype=None, name=None): """ - :alias_main: paddle.rand - :alias: paddle.tensor.rand, paddle.tensor.random.rand - This OP returns a Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. - Examples: - :: - - Input: - shape = [1, 2] - - Output: - result=[[0.8505902, 0.8397286]] - Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of the output Tensor. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -655,26 +597,21 @@ def rand(shape, dtype=None, name=None): Tensor: A Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - ValueError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: attr shape is a list which doesn't contain Tensor. - result_1 = paddle.rand(shape=[2, 3]) + out1 = paddle.rand(shape=[2, 3]) # [[0.451152 , 0.55825245, 0.403311 ], # random # [0.22550228, 0.22106001, 0.7877319 ]] # random # example 2: attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.rand(shape=[dim_1, dim_2, 2]) + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.rand(shape=[dim1, dim2, 2]) # [[[0.8879919 , 0.25788337], # random # [0.28826773, 0.9712097 ], # random # [0.26438272, 0.01796806]], # random @@ -683,19 +620,11 @@ def rand(shape, dtype=None, name=None): # [0.870881 , 0.2984597 ]]] # random # example 3: attr shape is a Tensor, the data type must be int64 or int32. 
- var_shape = paddle.to_variable(np.array([2, 3])) - result_3 = paddle.rand(var_shape) + shape_tensor = paddle.to_tensor([2, 3]) + result_3 = paddle.rand(shape_tensor) + # [[0.22920267, 0.841956 , 0.05981819], # random # [0.4836288 , 0.24573246, 0.7516129 ]] # random """ - if dtype is None: - dtype = paddle.framework.get_default_dtype() - if dtype not in ['float32', 'float64']: - raise TypeError( - "rand only supports [float32, float64], but the default dtype is %s" - % dtype) - - out = uniform(shape, dtype, min=0.0, max=1.0, name=name) - out.stop_gradient = True - return out + return uniform(shape, dtype, min=0.0, max=1.0, name=name) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index eede022e05ba61bc23da517e7af7cd2eb58f5416..ce03d0ef15f0f80f4e01cf57bc8cc449186c2560 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -18,7 +18,6 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtyp from ..fluid import core, layers # TODO: define searching & indexing functions of a tensor -from ..fluid.layers import argmin #DEFINE_ALIAS from ..fluid.layers import has_inf #DEFINE_ALIAS from ..fluid.layers import has_nan #DEFINE_ALIAS @@ -67,16 +66,15 @@ def argsort(x, axis=-1, descending=False, name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_array = np.array([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]]).astype(np.float32) - x = paddle.to_variable(input_array) + x = paddle.to_tensor([[[5,8,9,5], + [0,0,1,7], + [6,9,2,4]], + [[5,2,4,2], + [4,7,7,9], + [1,7,0,6]]], + dtype='float32') out1 = paddle.argsort(x=x, axis=-1) out2 = paddle.argsort(x=x, axis=0) out3 = paddle.argsort(x=x, axis=1) @@ -124,7 +122,7 @@ def argsort(x, axis=-1, descending=False, name=None): return ids -def argmax(x, axis=None, dtype=None, keepdim=False, name=None): +def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): """ This OP computes the indices of the max elements of the input tensor's element along the provided axis. @@ -135,10 +133,10 @@ def argmax(x, axis=None, dtype=None, keepdim=False, name=None): axis(int, optional): Axis to compute indices along. The effective range is [-R, R), where R is x.ndim. when axis < 0, it works the same way as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. - dtype(str): Data type of the output tensor which can - be int32, int64. The default value is None, and it will - return the int64 indices. keepdim(bool, optional): Keep the axis that selecting max. The defalut value is False. + dtype(str|np.dtype, optional): Data type of the output tensor which can + be int32, int64. The default value is 'int64', and it will + return the int64 indices. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -149,14 +147,12 @@ def argmax(x, axis=None, dtype=None, keepdim=False, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - data = np.array([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - x = paddle.to_variable(data) + x = paddle.to_tensor([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) out1 = paddle.argmax(x) print(out1.numpy()) # 2 out2 = paddle.argmax(x, axis=1) @@ -166,48 +162,45 @@ def argmax(x, axis=None, dtype=None, keepdim=False, name=None): print(out3.numpy()) # [2 3 1] """ + if axis is not None and not isinstance(axis, int): + raise TypeError( + "The type of 'axis' must be int or None in argmax, but received %s." % (type(axis))) + + if not (isinstance(dtype, str) or isinstance(dtype, np.dtype)): + raise TypeError( + "the type of 'dtype' in argmax must be str or np.dtype, but received {}". format(type(dtype))) + + var_dtype = convert_np_dtype_to_dtype_(dtype) + check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmax') flatten = False if axis is None: flatten = True axis = 0 if in_dygraph_mode(): - if dtype != None: - var_dtype = convert_np_dtype_to_dtype_(dtype) - out = core.ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, - 'keepdim', keepdim, 'flatten', flatten) - else: - out = core.ops.arg_max(x, 'axis', axis, 'keepdim', keepdim, - 'flatten', flatten) + out = core.ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', keepdim, 'flatten', flatten) return out helper = LayerHelper("argmax", **locals()) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], 'paddle.argmax') - var_dtype = None attrs = {} - if dtype is not None: - if dtype not in ['int32', 'int64']: - raise ValueError( - "The value of 'dtype' in argmax op must be int32, int64, but received of {}". - format(dtype)) - var_dtype = convert_np_dtype_to_dtype_(dtype) - attrs["dtype"] = var_dtype - else: - var_dtype = VarDesc.VarType.INT64 - out = helper.create_variable_for_type_inference(var_dtype) attrs['keepdims'] = keepdim attrs['axis'] = axis attrs['flatten'] = flatten + attrs['dtype'] = var_dtype helper.append_op( type='arg_max', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs) out.stop_gradient = True return out -def argmin(x, axis=None, dtype=None, keepdim=False, name=None): +def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): """ This OP computes the indices of the min elements of the input tensor's element along the provided axis. @@ -218,10 +211,10 @@ def argmin(x, axis=None, dtype=None, keepdim=False, name=None): axis(int, optional): Axis to compute indices along. The effective range is [-R, R), where R is x.ndim. when axis < 0, it works the same way as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. + keepdim(bool, optional): Keep the axis that selecting min. The default value is False. dtype(str): Data type of the output tensor which can - be int32, int64. The default value is None, and it will + be int32, int64. The default value is 'int64', and it will return the int64 indices. - keepdim(bool, optional): Keep the axis that selecting min. The defalut value is False. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -232,14 +225,12 @@ def argmin(x, axis=None, dtype=None, keepdim=False, name=None): Examples: ..
code-block:: python - import numpy as np import paddle paddle.disable_static() - data = np.array([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - x = paddle.to_variable(data) + x = paddle.to_tensor([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) out1 = paddle.argmin(x) print(out1.numpy()) # 4 out2 = paddle.argmin(x, axis=1) @@ -249,41 +240,38 @@ def argmin(x, axis=None, dtype=None, keepdim=False, name=None): print(out3.numpy()) # [0 0 2] """ + if axis is not None and not isinstance(axis, int): + raise TypeError( + "The type of 'axis' must be int or None in argmin, but received %s." + % (type(axis))) + + if not (isinstance(dtype, str) or isinstance(dtype, np.dtype)): + raise TypeError( + "the type of 'dtype' in argmin must be str or np.dtype, but received {}". + format(type(dtype))) + + var_dtype = convert_np_dtype_to_dtype_(dtype) + check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') flatten = False if axis is None: flatten = True axis = 0 if in_dygraph_mode(): - if dtype != None: - var_dtype = convert_np_dtype_to_dtype_(dtype) - out = core.ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, - 'keepdim', keepdim, 'flatten', flatten) - else: - out = core.ops.arg_min(x, 'axis', axis, 'keepdim', keepdim, - 'flatten', flatten) + out = core.ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', + keepdim, 'flatten', flatten) return out helper = LayerHelper("argmin", **locals()) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], 'paddle.argmin') - var_dtype = None - attrs = {} - if dtype is not None: - if dtype not in ['int32', 'int64']: - raise ValueError( - "The value of 'dtype' in argmin op must be int32, int64, but received of {}". - format(dtype)) - var_dtype = convert_np_dtype_to_dtype_(dtype) - attrs["dtype"] = var_dtype - else: - var_dtype = VarDesc.VarType.INT64 - out = helper.create_variable_for_type_inference(var_dtype) + attrs = {} attrs['keepdims'] = keepdim attrs['axis'] = axis attrs['flatten'] = flatten + attrs['dtype'] = var_dtype helper.append_op( type='arg_min', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs) out.stop_gradient = True @@ -311,24 +299,16 @@ def index_select(x, index, axis=0, name=None): Returns: Tensor: A Tensor with same data type as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # Now we are in imperative mode - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]) - data_index = np.array([0, 1, 1]).astype('int32') - - x = paddle.to_tensor(data) - index = paddle.to_tensor(data_index) + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]) + index = paddle.to_tensor([0, 1, 1], dtype='int32') out_z1 = paddle.index_select(x=x, index=index) #[[1. 2. 3. 4.] # [5. 6. 7. 8.] @@ -382,48 +362,44 @@ def nonzero(input, as_tuple=False): Examples: ..
code-block:: python import paddle - import paddle.fluid as fluid - import numpy as np - - data1 = np.array([[1.0, 0.0, 0.0], - [0.0, 2.0, 0.0], - [0.0, 0.0, 3.0]]) - data2 = np.array([0.0, 1.0, 0.0, 3.0]) - data3 = np.array([0.0, 0.0, 0.0]) - with fluid.dygraph.guard(): - x1 = fluid.dygraph.to_variable(data1) - x2 = fluid.dygraph.to_variable(data2) - x3 = fluid.dygraph.to_variable(data3) - out_z1 = paddle.nonzero(x1) - print(out_z1.numpy()) - #[[0 0] - # [1 1] - # [2 2]] - out_z1_tuple = paddle.nonzero(x1, as_tuple=True) - for out in out_z1_tuple: - print(out.numpy()) - #[[0] - # [1] - # [2]] - #[[0] - # [1] - # [2]] - out_z2 = paddle.nonzero(x2) - print(out_z2.numpy()) - #[[1] - # [3]] - out_z2_tuple = paddle.nonzero(x2, as_tuple=True) - for out in out_z2_tuple: - print(out.numpy()) - #[[1] - # [3]] - out_z3 = paddle.nonzero(x3) - print(out_z3.numpy()) - #[] - out_z3_tuple = paddle.nonzero(x3, as_tuple=True) - for out in out_z3_tuple: - print(out.numpy()) - #[] + + paddle.disable_static() + + x1 = paddle.to_tensor([[1.0, 0.0, 0.0], + [0.0, 2.0, 0.0], + [0.0, 0.0, 3.0]]) + x2 = paddle.to_tensor([0.0, 1.0, 0.0, 3.0]) + x3 = paddle.to_tensor([0.0, 0.0, 0.0]) + out_z1 = paddle.nonzero(x1) + print(out_z1.numpy()) + #[[0 0] + # [1 1] + # [2 2]] + out_z1_tuple = paddle.nonzero(x1, as_tuple=True) + for out in out_z1_tuple: + print(out.numpy()) + #[[0] + # [1] + # [2]] + #[[0] + # [1] + # [2]] + out_z2 = paddle.nonzero(x2) + print(out_z2.numpy()) + #[[1] + # [3]] + out_z2_tuple = paddle.nonzero(x2, as_tuple=True) + for out in out_z2_tuple: + print(out.numpy()) + #[[1] + # [3]] + out_z3 = paddle.nonzero(x3) + print(out_z3.numpy()) + #[] + out_z3_tuple = paddle.nonzero(x3, as_tuple=True) + for out in out_z3_tuple: + print(out.numpy()) + #[] """ list_out = [] shape = input.shape @@ -470,16 +446,15 @@ def sort(x, axis=-1, descending=False, name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_array = np.array([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]]).astype(np.float32) - x = paddle.to_variable(input_array) + x = paddle.to_tensor([[[5,8,9,5], + [0,0,1,7], + [6,9,2,4]], + [[5,2,4,2], + [4,7,7,9], + [1,7,0,6]]], + dtype='float32') out1 = paddle.sort(x=x, axis=-1) out2 = paddle.sort(x=x, axis=0) out3 = paddle.sort(x=x, axis=1) @@ -555,16 +530,11 @@ def where(condition, x, y, name=None): .. code-block:: python import paddle - import numpy as np - import paddle.fluid as fluid - - x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float32") - y_i = np.array([1.0, 1.0, 1.0, 1.0]).astype("float32") - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(x_i) - y = fluid.dygraph.to_variable(y_i) - out = paddle.where(x>1, x, y) + paddle.disable_static() + x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) + y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) + out = paddle.where(x>1, x, y) print(out.numpy()) #out: [1.0, 1.0, 3.2, 1.2] @@ -641,50 +611,41 @@ def index_sample(x, index): .. 
code-block:: python import paddle - import paddle.fluid as fluid - import numpy as np - - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]).astype('float32') - - data_index = np.array([[0, 1, 2], - [1, 2, 3], - [0, 0, 0]]).astype('int32') - - target_data = np.array([[100, 200, 300, 400], - [500, 600, 700, 800], - [900, 1000, 1100, 1200]]).astype('int32') - - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(data) - index = fluid.dygraph.to_variable(data_index) - target = fluid.dygraph.to_variable(target_data) - - out_z1 = paddle.index_sample(x, index) - print(out_z1.numpy()) - #[[1. 2. 3.] - # [6. 7. 8.] - # [9. 9. 9.]] - - # Use the index of the maximum value by topk op - # get the value of the element of the corresponding index in other tensors - top_value, top_index = fluid.layers.topk(x, k=2) - out_z2 = paddle.index_sample(target, top_index) - print(top_value.numpy()) - #[[ 4. 3.] - # [ 8. 7.] - # [12. 11.]] - - print(top_index.numpy()) - #[[3 2] - # [3 2] - # [3 2]] - - print(out_z2.numpy()) - #[[ 400 300] - # [ 800 700] - # [1200 1100]] + + paddle.disable_static() + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]], dtype='float32') + index = paddle.to_tensor([[0, 1, 2], + [1, 2, 3], + [0, 0, 0]], dtype='int32') + target = paddle.to_tensor([[100, 200, 300, 400], + [500, 600, 700, 800], + [900, 1000, 1100, 1200]], dtype='int32') + out_z1 = paddle.index_sample(x, index) + print(out_z1.numpy()) + #[[1. 2. 3.] + # [6. 7. 8.] + # [9. 9. 9.]] + + # Use the index of the maximum value by topk op + # get the value of the element of the corresponding index in other tensors + top_value, top_index = paddle.topk(x, k=2) + out_z2 = paddle.index_sample(target, top_index) + print(top_value.numpy()) + #[[ 4. 3.] + # [ 8. 7.] + # [12. 11.]] + + print(top_index.numpy()) + #[[3 2] + # [3 2] + # [3 2]] + + print(out_z2.numpy()) + #[[ 400 300] + # [ 800 700] + # [1200 1100]] """ @@ -717,27 +678,20 @@ def masked_select(x, mask, name=None): Returns: A 1-D Tensor which is the same data type as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``mask`` must be a Tensor and the data type of ``mask`` must be bool. - Examples: .. code-block:: python import paddle - import numpy as np - + paddle.disable_static() - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]).astype('float32') - - mask_data = np.array([[True, False, False, False], - [True, True, False, False], - [True, False, False, False]]).astype('bool') - x = paddle.to_tensor(data) - mask = paddle.to_tensor(mask_data) + + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]) + mask = paddle.to_tensor([[True, False, False, False], + [True, True, False, False], + [True, False, False, False]]) out = paddle.masked_select(x, mask) #[1.0 5.0 6.0 9.0] """ @@ -782,20 +736,17 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - data_1 = np.array([1, 4, 5, 7]) - tensor_1 = paddle.to_tensor(data_1) + tensor_1 = paddle.to_tensor([1, 4, 5, 7]) value_1, indices_1 = paddle.topk(tensor_1, k=1) print(value_1.numpy()) # [7] print(indices_1.numpy()) # [3] - data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]]) - tensor_2 = paddle.to_tensor(data_2) + tensor_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) value_2, indices_2 = paddle.topk(tensor_2, k=1) print(value_2.numpy()) # [[7] diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 91676a6316b81a1998b9b48fb9ea7fcba6d67c25..d56dff5a81018e13e1c186f66172f868b0c4074b 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -237,10 +237,6 @@ def numel(x, name=None): Returns: Tensor: The number of elements for the input Tensor. - - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - Examples: .. code-block:: python diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 7b79b25cbc3e98b802bad87386ad0572ec6ab8d7..b7b5d44650f8d62926241a57feedfd5b932a37f5 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -416,6 +416,29 @@ class TestModelFunction(unittest.TestCase): shutil.rmtree(path) fluid.disable_dygraph() if dynamic else None + def test_dynamic_load(self): + mnist_data = MnistDataset(mode='train') + for new_optimizer in [True, False]: + path = tempfile.mkdtemp() + paddle.disable_static() + net = LeNet() + inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] + if new_optimizer: + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=net.parameters()) + else: + optim = fluid.optimizer.Adam( + learning_rate=0.001, parameter_list=net.parameters()) + model = Model(net, inputs, labels) + model.prepare( + optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) + model.fit(mnist_data, batch_size=64, verbose=0) + model.save(path + '/test') + model.load(path + '/test') + shutil.rmtree(path) + paddle.enable_static() + def test_dynamic_save_static_load(self): path = tempfile.mkdtemp() # dynamic saving @@ -476,6 +499,30 @@ class TestModelFunction(unittest.TestCase): self.assertTrue(params[0].shape[1] == 10) fluid.disable_dygraph() if dynamic else None + def test_summary(self): + def _get_param_from_state_dict(state_dict): + params = 0 + for k, v in state_dict.items(): + params += np.prod(v.numpy().shape) + return params + + for dynamic in [True, False]: + device = paddle.set_device('cpu') + fluid.enable_dygraph(device) if dynamic else None + net = MyModel() + inputs = [InputSpec([None, 20], 'float32', 'x')] + model = Model(net, inputs) + model.prepare() + params_info = model.summary() + gt_params = _get_param_from_state_dict(net.state_dict()) + + np.testing.assert_allclose(params_info['total_params'], gt_params) + print(params_info) + + model.summary(input_size=(20)) + model.summary(input_size=[(20)]) + model.summary(input_size=(20), batch_size=2) + def test_export_deploy_model(self): for dynamic in [True, False]: fluid.enable_dygraph() if dynamic else None diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index f6299980b3e5c0bd0c7551b6b51c9b067d7960b5..2a649c776b4103b1d3d8648957bbff7a32007410 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -12,14 +12,14 @@ # See the License for the 
specific language governing permissions and # limitations under the License. -from .plot import Ploter from .profiler import ProfilerOptions from .profiler import Profiler from .profiler import get_profiler from .deprecated import deprecated + from . import download -__all__ = ['dump_config', 'Ploter', 'deprecated', 'download'] +__all__ = ['dump_config', 'deprecated', 'download'] #TODO: define new api under this directory # __all__ = ['unique_name', diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py index 08fd7e33479b331454f63f05f6240dd221591ee9..d4e21748b55326468edb2ba1e46114e8d66c0046 100644 --- a/python/paddle/utils/deprecated.py +++ b/python/paddle/utils/deprecated.py @@ -45,7 +45,7 @@ def deprecated(update_to="", since="", reason=""): def decorator(func): # TODO(zhiqiu): We temporally disable the warnings for 2.0-bata, and it should be re-enabled in the future. - return func + # return func """construct warning message, and return a decorated function or class.""" assert isinstance(update_to, str), 'type of "update_to" must be str.' assert isinstance(since, str), 'type of "since" must be str.' @@ -56,9 +56,10 @@ def deprecated(update_to="", since="", reason=""): _reason = reason.strip() msg = 'API "{}.{}" is deprecated'.format(func.__module__, func.__name__) + if len(_since) > 0: msg += " since {}".format(_since) - msg += ", and may be removed in future versions." + msg += ", and will be removed in future versions." if len(_update_to) > 0: assert _update_to.startswith( "paddle." @@ -67,6 +68,11 @@ def deprecated(update_to="", since="", reason=""): msg += ' Please use "{}" instead.'.format(_update_to) if len(_reason) > 0: msg += "\n reason: {}".format(_reason) + if func.__doc__: + func.__doc__ = ('\n\nWarning: ' + msg + '\n') + func.__doc__ + # TODO(Joejiong) Early returning the wrapper function, currently we disable the warning wrapper, + # because the 2.0beta APIs are still under development, we will restore the warning functionality when 2.0 rc APIs become stable. + return func @functools.wraps(func) def wrapper(*args, **kwargs): @@ -75,6 +81,7 @@ def deprecated(update_to="", since="", reason=""): 2. since version is empty, in this case, API is deprecated in all versions. 3. current version is newer than since version. """ + msg = "\033[93mWarning %s \033[0m" % (msg) v_current = [int(i) for i in paddle.__version__.split(".")] v_current += [0] * (4 - len(v_current)) v_since = [int(i) for i in _since.split(".")] diff --git a/python/paddle/utils/plot.py b/python/paddle/utils/plot.py deleted file mode 100644 index ee651f2f0cd6f2e594a4e74c896baa924f70bbf5..0000000000000000000000000000000000000000 --- a/python/paddle/utils/plot.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import six - - -class PlotData(object): - def __init__(self): - self.step = [] - self.value = [] - - def append(self, step, value): - self.step.append(step) - self.value.append(value) - - def reset(self): - self.step = [] - self.value = [] - - -class Ploter(object): - """ - Plot input data in a 2D graph - - Args: - title: assign the title of input data. - step: x_axis of the data. - value: y_axis of the data. - """ - - def __init__(self, *args): - self.__args__ = args - self.__plot_data__ = {} - for title in args: - self.__plot_data__[title] = PlotData() - # demo in notebooks will use Ploter to plot figure, but when we convert - # the ipydb to py file for testing, the import of matplotlib will make the - # script crash. So we can use `export DISABLE_PLOT=True` to disable import - # these libs - self.__disable_plot__ = os.environ.get("DISABLE_PLOT") - if not self.__plot_is_disabled__(): - import matplotlib.pyplot as plt - from IPython import display - self.plt = plt - self.display = display - - def __plot_is_disabled__(self): - return self.__disable_plot__ == "True" - - def append(self, title, step, value): - """ - Feed data - - Args: - title: assign the group data to this subtitle. - step: the x_axis of data. - value: the y_axis of data. - - Examples: - .. code-block:: python - plot_curve = Ploter("Curve 1","Curve 2") - plot_curve.append(title="Curve 1",step=1,value=1) - """ - assert isinstance(title, six.string_types) - assert title in self.__plot_data__ - data = self.__plot_data__[title] - assert isinstance(data, PlotData) - data.append(step, value) - - def plot(self, path=None): - """ - Plot data in a 2D graph - - Args: - path: store the figure to this file path. Defaul None. - - Examples: - .. code-block:: python - plot_curve = Ploter() - plot_cure.plot() - """ - if self.__plot_is_disabled__(): - return - - titles = [] - for title in self.__args__: - data = self.__plot_data__[title] - assert isinstance(data, PlotData) - if len(data.step) > 0: - titles.append(title) - self.plt.plot(data.step, data.value) - self.plt.legend(titles, loc='upper left') - if path is None: - self.display.clear_output(wait=True) - self.display.display(self.plt.gcf()) - else: - self.plt.savefig(path) - self.plt.gcf().clear() - - def reset(self): - for key in self.__plot_data__: - data = self.__plot_data__[key] - assert isinstance(data, PlotData) - data.reset() diff --git a/python/requirements.txt b/python/requirements.txt index e278a1b824cc3829f1b67bc3a0cf643840990bb9..c8d3b2af1794bb0858b187d6a4c641322f50cdd1 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -21,4 +21,3 @@ objgraph astor pathlib netifaces -psutil diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 2c575e4abf1beed039d3293821b8df356d4e9295..1e5179d0282d7f35c4232d9b9783cb831e83f462 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -19,8 +19,8 @@ API_FILES=("CMakeLists.txt" "paddle/fluid/framework/ir/node.h" "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" - "python/paddle/distributed/__init" - "python/paddle/distributed/fleet/__init__.py" + "python/paddle/distributed/__init" + "python/paddle/distributed/fleet/__init__.py" "python/requirements.txt" "python/paddle/fluid/__init__.py" "python/paddle/fluid/compiler.py" @@ -39,6 +39,7 @@ API_FILES=("CMakeLists.txt" "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" 
"python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" "tools/wlist.json" + "paddle/scripts/paddle_build.bat" ) approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` @@ -114,17 +115,20 @@ for API_FILE in ${API_FILES[*]}; do echo_line="You must have one RD (luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n" check_approval 1 6836917 43953930 elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then - echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n" - check_approval 1 39303645 6836917 43953930 + echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n" + check_approval 1 39303645 6836917 43953930 elif [ "${API_FILE}" == "tools/wlist.json" ];then - echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n" - check_approval 1 29231 + echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n" + check_approval 1 29231 elif [ "${API_FILE}" == "python/paddle/distributed/fleet/__init__.py" ]; then - echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes " - check_approval 1 35550832 38231817 + echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes " + check_approval 1 35550832 38231817 elif [ "${API_FILE}" == "python/paddle/distributed/__init__.py" ]; then - echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes " - check_approval 1 35550832 38231817 + echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes " + check_approval 1 35550832 38231817 + elif [ "${API_FILE}" == "paddle/scripts/paddle_build.bat" ]; then + echo_line="You must have one RD (zhouwei25 (Recommend), luotao1) approval for ${API_FILE} changes, which manages all Paddle CI task on Windows.\n" + check_approval 1 52485244 6836917 else echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n" check_approval 1 3048612 46782768 12538138 6836917 @@ -159,7 +163,7 @@ fi HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true` if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), liuwei1031, or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" + echo_line="Unittest is not allowed to be disabled.\nYou must have one RD 
(kolinwei(Recommend), or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" check_approval 1 22165420 6836917 46661762 fi diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh new file mode 100755 index 0000000000000000000000000000000000000000..a1881f551da1ca022c186c50c667e51dff89f9be --- /dev/null +++ b/tools/get_cpu_info.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +if [ "`uname -s`" != "Linux" ]; then + echo "Current scenario only support in Linux yet!" + exit 0 +fi + +echo "********** Hardware Information **********" +sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l` +cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l` +ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs` +physical_cores=$((sockets * cores_per_socket)) +virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l` +numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs` +echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`" +echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`" +echo "Socket Number : $sockets" +echo "Cores Per Socket : $cores_per_socket" +echo "Total Physical Cores : $physical_cores" +echo "Total Virtual Cores : $virtual_cores" +if [ $ht -eq 1 ]; then + echo "Hyper Threading : OFF" + if [ $physical_cores -ne $virtual_cores ]; then + echo "Error: HT logical error" + fi +else + echo "Hyper Threading : ON" + if [ $physical_cores -ge $virtual_cores ]; then + echo "Error: HT logical error" + fi +fi +echo "NUMA Nodes : $numa_nodes" +if [ $numa_nodes -lt $sockets ]; then + echo "Warning: NUMA node is not enough for the best performance,\ + at least $sockets" +fi + +echo "********** Software Information **********" +echo "OS Version : `cat /proc/version`" +echo "Kernel Release Version : `uname -r`" +echo "Kernel Patch Version : `uname -v`" +echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" +if command -v cmake >/dev/null 2>&1; then + cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'` +else + cmake_ver=" Not installed" +fi +echo "CMake Version :$cmake_ver" +echo "******************************************"
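The random.py changes in this patch reduce `rand` and `standard_normal`/`randn` to one-line wrappers over `uniform` and `gaussian`, which now own the default-dtype lookup and the float32/float64 check. Below is a minimal, self-contained sketch of that delegation pattern; it is plain NumPy/Python and not Paddle source, and `_DEFAULT_DTYPE` is only an illustrative stand-in for `paddle.framework.get_default_dtype()`.

.. code-block:: python

    import numpy as np

    _DEFAULT_DTYPE = 'float32'  # stand-in for paddle.framework.get_default_dtype()

    def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None):
        """Base sampler: the only place where dtype is defaulted and checked."""
        if dtype is None:
            dtype = _DEFAULT_DTYPE
        if dtype not in ('float32', 'float64'):
            raise TypeError(
                "only float32 and float64 are supported, but the default dtype "
                "is {}".format(dtype))
        return (np.random.randn(*shape) * std + mean).astype(dtype)

    def standard_normal(shape, dtype=None, name=None):
        # The wrapper collapses to a single return, as in the patch.
        return gaussian(shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name)

    randn = standard_normal

    print(standard_normal([2, 3]).dtype)         # float32 (the default)
    print(randn([2, 3], dtype='float64').dtype)  # float64

Keeping the validation in one base sampler is what lets each wrapper shrink to a single `return` statement without duplicating the dtype error handling.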
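The search.py changes move `keepdim` ahead of `dtype` in `argmax`/`argmin`, give `dtype` a concrete default of "int64", and validate `axis` and `dtype` once up front instead of branching on `dtype is None` in both the dygraph and static paths. The sketch below only illustrates that validate-then-dispatch shape: `_check_index_dtype` is a made-up helper, and NumPy stands in for `core.ops.arg_max`/`arg_min`.

.. code-block:: python

    import numpy as np

    def _check_index_dtype(dtype, op_name):
        # Accept a str or np.dtype, normalise it, and restrict it to int32/int64.
        if not isinstance(dtype, (str, np.dtype)):
            raise TypeError(
                "the type of 'dtype' in {} must be str or np.dtype, but received {}"
                .format(op_name, type(dtype)))
        if str(np.dtype(dtype)) not in ('int32', 'int64'):
            raise TypeError(
                "'dtype' in {} must be int32 or int64, but received {}"
                .format(op_name, dtype))
        return np.dtype(dtype)

    def argmax(x, axis=None, keepdim=False, dtype="int64"):
        # Validation happens exactly once, before any dispatch.
        out_dtype = _check_index_dtype(dtype, 'argmax')
        flatten = axis is None
        if flatten:
            x, axis = np.ravel(x), 0
        out = np.argmax(x, axis=axis).astype(out_dtype)
        return np.expand_dims(out, axis) if keepdim and not flatten else out

    data = np.array([[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]])
    print(argmax(data))                               # 2 (flattened index)
    print(argmax(data, axis=1, dtype='int32').dtype)  # int32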
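The deprecated.py change builds the deprecation message eagerly, prepends a `Warning:` block to the wrapped function's docstring, and then early-returns the original function, so no runtime warning is emitted during the 2.0-beta window. A simplified sketch of that behaviour follows; the asserts, the version comparison, and the disabled warning wrapper are omitted, and `paddle.new_api` is a hypothetical target used only for the demonstration.

.. code-block:: python

    def deprecated(update_to="", since="", reason=""):
        def decorator(func):
            msg = 'API "{}.{}" is deprecated'.format(func.__module__, func.__name__)
            if since:
                msg += " since {}".format(since)
            msg += ", and will be removed in future versions."
            if update_to:
                msg += ' Please use "{}" instead.'.format(update_to)
            if reason:
                msg += "\n reason: {}".format(reason)
            if func.__doc__:
                func.__doc__ = '\n\nWarning: ' + msg + '\n' + func.__doc__
            # Early return: the runtime-warning wrapper stays disabled for the beta.
            return func
        return decorator

    @deprecated(since="2.0", update_to="paddle.new_api")
    def old_api():
        """Original docs of old_api."""
        return 42

    print(old_api.__doc__)  # starts with the injected "Warning: ..." block
    print(old_api())        # behaviour unchanged: 42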